{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 50.0,
  "eval_steps": 500,
  "global_step": 3200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015625,
      "grad_norm": 1.8719109296798706,
      "learning_rate": 0.0002,
      "loss": 2.0019,
      "step": 1
    },
    {
      "epoch": 0.03125,
      "grad_norm": 2.353105306625366,
      "learning_rate": 0.0001999375,
      "loss": 2.4135,
      "step": 2
    },
    {
      "epoch": 0.046875,
      "grad_norm": 2.117205858230591,
      "learning_rate": 0.00019987500000000002,
      "loss": 2.1936,
      "step": 3
    },
    {
      "epoch": 0.0625,
      "grad_norm": 2.2750327587127686,
      "learning_rate": 0.0001998125,
      "loss": 2.3194,
      "step": 4
    },
    {
      "epoch": 0.078125,
      "grad_norm": 2.4351274967193604,
      "learning_rate": 0.00019975,
      "loss": 2.1065,
      "step": 5
    },
    {
      "epoch": 0.09375,
      "grad_norm": 2.355513095855713,
      "learning_rate": 0.0001996875,
      "loss": 2.15,
      "step": 6
    },
    {
      "epoch": 0.109375,
      "grad_norm": 2.2269349098205566,
      "learning_rate": 0.00019962500000000001,
      "loss": 2.2739,
      "step": 7
    },
    {
      "epoch": 0.125,
      "grad_norm": 2.051481246948242,
      "learning_rate": 0.0001995625,
      "loss": 2.1879,
      "step": 8
    },
    {
      "epoch": 0.140625,
      "grad_norm": 2.470672130584717,
      "learning_rate": 0.00019950000000000002,
      "loss": 2.2054,
      "step": 9
    },
    {
      "epoch": 0.15625,
      "grad_norm": 2.0219268798828125,
      "learning_rate": 0.00019943750000000002,
      "loss": 2.4391,
      "step": 10
    },
    {
      "epoch": 0.171875,
      "grad_norm": 2.2756736278533936,
      "learning_rate": 0.000199375,
      "loss": 2.3523,
      "step": 11
    },
    {
      "epoch": 0.1875,
      "grad_norm": 1.945286750793457,
      "learning_rate": 0.0001993125,
      "loss": 2.402,
      "step": 12
    },
    {
      "epoch": 0.203125,
      "grad_norm": 2.121889591217041,
      "learning_rate": 0.00019925,
      "loss": 2.1365,
      "step": 13
    },
    {
      "epoch": 0.21875,
      "grad_norm": 2.6106631755828857,
      "learning_rate": 0.0001991875,
      "loss": 2.1216,
      "step": 14
    },
    {
      "epoch": 0.234375,
      "grad_norm": 2.1075828075408936,
      "learning_rate": 0.000199125,
      "loss": 2.5793,
      "step": 15
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.1582677364349365,
      "learning_rate": 0.00019906250000000002,
      "loss": 2.2373,
      "step": 16
    },
    {
      "epoch": 0.265625,
      "grad_norm": 2.1860029697418213,
      "learning_rate": 0.000199,
      "loss": 2.103,
      "step": 17
    },
    {
      "epoch": 0.28125,
      "grad_norm": 2.329177141189575,
      "learning_rate": 0.0001989375,
      "loss": 1.8669,
      "step": 18
    },
    {
      "epoch": 0.296875,
      "grad_norm": 2.2798051834106445,
      "learning_rate": 0.00019887500000000002,
      "loss": 2.4221,
      "step": 19
    },
    {
      "epoch": 0.3125,
      "grad_norm": 2.153740406036377,
      "learning_rate": 0.00019881250000000001,
      "loss": 2.2804,
      "step": 20
    },
    {
      "epoch": 0.328125,
      "grad_norm": 1.9342625141143799,
      "learning_rate": 0.00019875,
      "loss": 2.2716,
      "step": 21
    },
    {
      "epoch": 0.34375,
      "grad_norm": 2.46905517578125,
      "learning_rate": 0.0001986875,
      "loss": 2.5111,
      "step": 22
    },
    {
      "epoch": 0.359375,
      "grad_norm": 2.330343008041382,
      "learning_rate": 0.00019862500000000002,
      "loss": 2.2401,
      "step": 23
    },
    {
      "epoch": 0.375,
      "grad_norm": 2.051076650619507,
      "learning_rate": 0.0001985625,
      "loss": 2.1632,
      "step": 24
    },
    {
      "epoch": 0.390625,
      "grad_norm": 2.1071362495422363,
      "learning_rate": 0.00019850000000000003,
      "loss": 1.8548,
      "step": 25
    },
    {
      "epoch": 0.40625,
      "grad_norm": 2.509737968444824,
      "learning_rate": 0.00019843750000000002,
      "loss": 2.1552,
      "step": 26
    },
    {
      "epoch": 0.421875,
      "grad_norm": 2.1835129261016846,
      "learning_rate": 0.000198375,
      "loss": 2.4059,
      "step": 27
    },
    {
      "epoch": 0.4375,
      "grad_norm": 2.033925771713257,
      "learning_rate": 0.0001983125,
      "loss": 2.1809,
      "step": 28
    },
    {
      "epoch": 0.453125,
      "grad_norm": 2.058732271194458,
      "learning_rate": 0.00019825,
      "loss": 2.4359,
      "step": 29
    },
    {
      "epoch": 0.46875,
      "grad_norm": 2.335789680480957,
      "learning_rate": 0.0001981875,
      "loss": 2.3268,
      "step": 30
    },
    {
      "epoch": 0.484375,
      "grad_norm": 2.1191935539245605,
      "learning_rate": 0.000198125,
      "loss": 2.3007,
      "step": 31
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.209824562072754,
      "learning_rate": 0.00019806250000000002,
      "loss": 2.4542,
      "step": 32
    },
    {
      "epoch": 0.515625,
      "grad_norm": 2.2472550868988037,
      "learning_rate": 0.00019800000000000002,
      "loss": 2.2277,
      "step": 33
    },
    {
      "epoch": 0.53125,
      "grad_norm": 2.2530689239501953,
      "learning_rate": 0.0001979375,
      "loss": 2.1629,
      "step": 34
    },
    {
      "epoch": 0.546875,
      "grad_norm": 2.194068193435669,
      "learning_rate": 0.000197875,
      "loss": 1.7826,
      "step": 35
    },
    {
      "epoch": 0.5625,
      "grad_norm": 2.2254586219787598,
      "learning_rate": 0.0001978125,
      "loss": 2.2475,
      "step": 36
    },
    {
      "epoch": 0.578125,
      "grad_norm": 2.3649280071258545,
      "learning_rate": 0.00019775,
      "loss": 2.2339,
      "step": 37
    },
    {
      "epoch": 0.59375,
      "grad_norm": 1.998327374458313,
      "learning_rate": 0.0001976875,
      "loss": 2.2739,
      "step": 38
    },
    {
      "epoch": 0.609375,
      "grad_norm": 2.264702796936035,
      "learning_rate": 0.00019762500000000002,
      "loss": 2.3385,
      "step": 39
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.1773226261138916,
      "learning_rate": 0.0001975625,
      "loss": 2.4168,
      "step": 40
    },
    {
      "epoch": 0.640625,
      "grad_norm": 2.198726177215576,
      "learning_rate": 0.00019750000000000003,
      "loss": 2.2166,
      "step": 41
    },
    {
      "epoch": 0.65625,
      "grad_norm": 2.0798370838165283,
      "learning_rate": 0.00019743750000000002,
      "loss": 2.4496,
      "step": 42
    },
    {
      "epoch": 0.671875,
      "grad_norm": 2.1094698905944824,
      "learning_rate": 0.00019737499999999999,
      "loss": 2.1209,
      "step": 43
    },
    {
      "epoch": 0.6875,
      "grad_norm": 2.2794485092163086,
      "learning_rate": 0.0001973125,
      "loss": 2.1789,
      "step": 44
    },
    {
      "epoch": 0.703125,
      "grad_norm": 2.2094063758850098,
      "learning_rate": 0.00019725,
      "loss": 2.366,
      "step": 45
    },
    {
      "epoch": 0.71875,
      "grad_norm": 2.1685726642608643,
      "learning_rate": 0.00019718750000000002,
      "loss": 2.0081,
      "step": 46
    },
    {
      "epoch": 0.734375,
      "grad_norm": 2.3561131954193115,
      "learning_rate": 0.000197125,
      "loss": 2.2701,
      "step": 47
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.0826942920684814,
      "learning_rate": 0.00019706250000000003,
      "loss": 2.2719,
      "step": 48
    },
    {
      "epoch": 0.765625,
      "grad_norm": 2.344053030014038,
      "learning_rate": 0.00019700000000000002,
      "loss": 2.1332,
      "step": 49
    },
    {
      "epoch": 0.78125,
      "grad_norm": 2.177788734436035,
      "learning_rate": 0.0001969375,
      "loss": 2.2716,
      "step": 50
    },
    {
      "epoch": 0.796875,
      "grad_norm": 2.2522244453430176,
      "learning_rate": 0.000196875,
      "loss": 1.9018,
      "step": 51
    },
    {
      "epoch": 0.8125,
      "grad_norm": 2.3848743438720703,
      "learning_rate": 0.0001968125,
      "loss": 2.2946,
      "step": 52
    },
    {
      "epoch": 0.828125,
      "grad_norm": 2.2048258781433105,
      "learning_rate": 0.00019675,
      "loss": 1.9149,
      "step": 53
    },
    {
      "epoch": 0.84375,
      "grad_norm": 2.1094186305999756,
      "learning_rate": 0.0001966875,
      "loss": 2.3651,
      "step": 54
    },
    {
      "epoch": 0.859375,
      "grad_norm": 2.239182949066162,
      "learning_rate": 0.00019662500000000002,
      "loss": 2.3537,
      "step": 55
    },
    {
      "epoch": 0.875,
      "grad_norm": 2.2012438774108887,
      "learning_rate": 0.00019656250000000001,
      "loss": 2.0803,
      "step": 56
    },
    {
      "epoch": 0.890625,
      "grad_norm": 2.172363758087158,
      "learning_rate": 0.0001965,
      "loss": 2.0677,
      "step": 57
    },
    {
      "epoch": 0.90625,
      "grad_norm": 2.344632863998413,
      "learning_rate": 0.0001964375,
      "loss": 2.5514,
      "step": 58
    },
    {
      "epoch": 0.921875,
      "grad_norm": 2.5960001945495605,
      "learning_rate": 0.00019637500000000002,
      "loss": 2.2973,
      "step": 59
    },
    {
      "epoch": 0.9375,
      "grad_norm": 2.545449733734131,
      "learning_rate": 0.0001963125,
      "loss": 2.4617,
      "step": 60
    },
    {
      "epoch": 0.953125,
      "grad_norm": 2.0409300327301025,
      "learning_rate": 0.00019625,
      "loss": 2.1179,
      "step": 61
    },
    {
      "epoch": 0.96875,
      "grad_norm": 2.2523374557495117,
      "learning_rate": 0.00019618750000000002,
      "loss": 1.7381,
      "step": 62
    },
    {
      "epoch": 0.984375,
      "grad_norm": 2.4158308506011963,
      "learning_rate": 0.000196125,
      "loss": 2.2771,
      "step": 63
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.814467668533325,
      "learning_rate": 0.00019606250000000003,
      "loss": 2.1076,
      "step": 64
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.888941764831543,
      "eval_runtime": 2.9469,
      "eval_samples_per_second": 173.742,
      "eval_steps_per_second": 43.436,
      "step": 64
    },
    {
      "epoch": 1.015625,
      "grad_norm": 1.979724407196045,
      "learning_rate": 0.000196,
      "loss": 2.1654,
      "step": 65
    },
    {
      "epoch": 1.03125,
      "grad_norm": 1.9491156339645386,
      "learning_rate": 0.0001959375,
      "loss": 2.2891,
      "step": 66
    },
    {
      "epoch": 1.046875,
      "grad_norm": 2.4911608695983887,
      "learning_rate": 0.000195875,
      "loss": 2.1202,
      "step": 67
    },
    {
      "epoch": 1.0625,
      "grad_norm": 2.1192307472229004,
      "learning_rate": 0.0001958125,
      "loss": 2.4115,
      "step": 68
    },
    {
      "epoch": 1.078125,
      "grad_norm": 2.1188411712646484,
      "learning_rate": 0.00019575000000000001,
      "loss": 2.1464,
      "step": 69
    },
    {
      "epoch": 1.09375,
      "grad_norm": 2.1810834407806396,
      "learning_rate": 0.0001956875,
      "loss": 2.2084,
      "step": 70
    },
    {
      "epoch": 1.109375,
      "grad_norm": 2.2514638900756836,
      "learning_rate": 0.00019562500000000003,
      "loss": 2.0682,
      "step": 71
    },
    {
      "epoch": 1.125,
      "grad_norm": 2.291006088256836,
      "learning_rate": 0.00019556250000000002,
      "loss": 2.2581,
      "step": 72
    },
    {
      "epoch": 1.140625,
      "grad_norm": 2.042743444442749,
      "learning_rate": 0.0001955,
      "loss": 2.0841,
      "step": 73
    },
    {
      "epoch": 1.15625,
      "grad_norm": 1.8026031255722046,
      "learning_rate": 0.0001954375,
      "loss": 2.0334,
      "step": 74
    },
    {
      "epoch": 1.171875,
      "grad_norm": 2.3401403427124023,
      "learning_rate": 0.00019537500000000002,
      "loss": 2.2052,
      "step": 75
    },
    {
      "epoch": 1.1875,
      "grad_norm": 2.122159957885742,
      "learning_rate": 0.0001953125,
      "loss": 2.191,
      "step": 76
    },
    {
      "epoch": 1.203125,
      "grad_norm": 2.088920831680298,
      "learning_rate": 0.00019525,
      "loss": 2.4106,
      "step": 77
    },
    {
      "epoch": 1.21875,
      "grad_norm": 2.2362513542175293,
      "learning_rate": 0.00019518750000000002,
      "loss": 2.1989,
      "step": 78
    },
    {
      "epoch": 1.234375,
      "grad_norm": 2.283374547958374,
      "learning_rate": 0.000195125,
      "loss": 1.9124,
      "step": 79
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.1380422115325928,
      "learning_rate": 0.0001950625,
      "loss": 2.2825,
      "step": 80
    },
    {
      "epoch": 1.265625,
      "grad_norm": 2.364685535430908,
      "learning_rate": 0.000195,
      "loss": 1.985,
      "step": 81
    },
    {
      "epoch": 1.28125,
      "grad_norm": 2.1319854259490967,
      "learning_rate": 0.00019493750000000002,
      "loss": 2.1911,
      "step": 82
    },
    {
      "epoch": 1.296875,
      "grad_norm": 2.033524513244629,
      "learning_rate": 0.000194875,
      "loss": 2.0377,
      "step": 83
    },
    {
      "epoch": 1.3125,
      "grad_norm": 2.0100972652435303,
      "learning_rate": 0.00019481250000000003,
      "loss": 2.1867,
      "step": 84
    },
    {
      "epoch": 1.328125,
      "grad_norm": 1.9985530376434326,
      "learning_rate": 0.00019475000000000002,
      "loss": 2.0233,
      "step": 85
    },
    {
      "epoch": 1.34375,
      "grad_norm": 2.1520256996154785,
      "learning_rate": 0.0001946875,
      "loss": 2.3613,
      "step": 86
    },
    {
      "epoch": 1.359375,
      "grad_norm": 2.461325168609619,
      "learning_rate": 0.000194625,
      "loss": 1.9714,
      "step": 87
    },
    {
      "epoch": 1.375,
      "grad_norm": 2.100831985473633,
      "learning_rate": 0.0001945625,
      "loss": 2.031,
      "step": 88
    },
    {
      "epoch": 1.390625,
      "grad_norm": 2.1566879749298096,
      "learning_rate": 0.0001945,
      "loss": 1.9603,
      "step": 89
    },
    {
      "epoch": 1.40625,
      "grad_norm": 2.3421988487243652,
      "learning_rate": 0.0001944375,
      "loss": 2.0914,
      "step": 90
    },
    {
      "epoch": 1.421875,
      "grad_norm": 2.224313974380493,
      "learning_rate": 0.00019437500000000002,
      "loss": 2.2833,
      "step": 91
    },
    {
      "epoch": 1.4375,
      "grad_norm": 1.8862192630767822,
      "learning_rate": 0.00019431250000000001,
      "loss": 2.1674,
      "step": 92
    },
    {
      "epoch": 1.453125,
      "grad_norm": 1.9711544513702393,
      "learning_rate": 0.00019425,
      "loss": 2.1249,
      "step": 93
    },
    {
      "epoch": 1.46875,
      "grad_norm": 2.1372008323669434,
      "learning_rate": 0.00019418750000000002,
      "loss": 2.2182,
      "step": 94
    },
    {
      "epoch": 1.484375,
      "grad_norm": 2.11291241645813,
      "learning_rate": 0.000194125,
      "loss": 2.0246,
      "step": 95
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.0340569019317627,
      "learning_rate": 0.0001940625,
      "loss": 1.7995,
      "step": 96
    },
    {
      "epoch": 1.515625,
      "grad_norm": 2.2284557819366455,
      "learning_rate": 0.000194,
      "loss": 2.1249,
      "step": 97
    },
    {
      "epoch": 1.53125,
      "grad_norm": 2.1998884677886963,
      "learning_rate": 0.00019393750000000002,
      "loss": 2.1522,
      "step": 98
    },
    {
      "epoch": 1.546875,
      "grad_norm": 1.9677811861038208,
      "learning_rate": 0.000193875,
      "loss": 2.1189,
      "step": 99
    },
    {
      "epoch": 1.5625,
      "grad_norm": 2.209949016571045,
      "learning_rate": 0.00019381250000000003,
      "loss": 2.1512,
      "step": 100
    },
    {
      "epoch": 1.578125,
      "grad_norm": 2.3401095867156982,
      "learning_rate": 0.00019375000000000002,
      "loss": 2.1869,
      "step": 101
    },
    {
      "epoch": 1.59375,
      "grad_norm": 2.262834310531616,
      "learning_rate": 0.0001936875,
      "loss": 2.1002,
      "step": 102
    },
    {
      "epoch": 1.609375,
      "grad_norm": 2.283750534057617,
      "learning_rate": 0.000193625,
      "loss": 2.395,
      "step": 103
    },
    {
      "epoch": 1.625,
      "grad_norm": 2.18599796295166,
      "learning_rate": 0.0001935625,
      "loss": 2.3566,
      "step": 104
    },
    {
      "epoch": 1.640625,
      "grad_norm": 2.19173002243042,
      "learning_rate": 0.00019350000000000001,
      "loss": 1.7981,
      "step": 105
    },
    {
      "epoch": 1.65625,
      "grad_norm": 2.2428224086761475,
      "learning_rate": 0.0001934375,
      "loss": 2.5195,
      "step": 106
    },
    {
      "epoch": 1.671875,
      "grad_norm": 2.3016700744628906,
      "learning_rate": 0.00019337500000000002,
      "loss": 2.3752,
      "step": 107
    },
    {
      "epoch": 1.6875,
      "grad_norm": 2.406924247741699,
      "learning_rate": 0.00019331250000000002,
      "loss": 2.5682,
      "step": 108
    },
    {
      "epoch": 1.703125,
      "grad_norm": 1.887404441833496,
      "learning_rate": 0.00019325,
      "loss": 2.1274,
      "step": 109
    },
    {
      "epoch": 1.71875,
      "grad_norm": 2.090501546859741,
      "learning_rate": 0.0001931875,
      "loss": 2.2439,
      "step": 110
    },
    {
      "epoch": 1.734375,
      "grad_norm": 2.345803737640381,
      "learning_rate": 0.000193125,
      "loss": 1.9856,
      "step": 111
    },
    {
      "epoch": 1.75,
      "grad_norm": 2.154759168624878,
      "learning_rate": 0.0001930625,
      "loss": 1.7944,
      "step": 112
    },
    {
      "epoch": 1.765625,
      "grad_norm": 2.4943838119506836,
      "learning_rate": 0.000193,
      "loss": 1.7652,
      "step": 113
    },
    {
      "epoch": 1.78125,
      "grad_norm": 2.5357320308685303,
      "learning_rate": 0.00019293750000000002,
      "loss": 2.2024,
      "step": 114
    },
    {
      "epoch": 1.796875,
      "grad_norm": 2.227419376373291,
      "learning_rate": 0.000192875,
      "loss": 1.9831,
      "step": 115
    },
    {
      "epoch": 1.8125,
      "grad_norm": 2.4392809867858887,
      "learning_rate": 0.00019281250000000003,
      "loss": 2.4413,
      "step": 116
    },
    {
      "epoch": 1.828125,
      "grad_norm": 2.130974054336548,
      "learning_rate": 0.00019275,
      "loss": 2.2049,
      "step": 117
    },
    {
      "epoch": 1.84375,
      "grad_norm": 2.08553147315979,
      "learning_rate": 0.0001926875,
      "loss": 2.3971,
      "step": 118
    },
    {
      "epoch": 1.859375,
      "grad_norm": 2.328704357147217,
      "learning_rate": 0.000192625,
      "loss": 2.2679,
      "step": 119
    },
    {
      "epoch": 1.875,
      "grad_norm": 2.347860336303711,
      "learning_rate": 0.0001925625,
      "loss": 2.2017,
      "step": 120
    },
    {
      "epoch": 1.890625,
      "grad_norm": 2.3405325412750244,
      "learning_rate": 0.00019250000000000002,
      "loss": 2.2944,
      "step": 121
    },
    {
      "epoch": 1.90625,
      "grad_norm": 2.2739880084991455,
      "learning_rate": 0.0001924375,
      "loss": 2.3215,
      "step": 122
    },
    {
      "epoch": 1.921875,
      "grad_norm": 2.457771062850952,
      "learning_rate": 0.00019237500000000003,
      "loss": 2.0593,
      "step": 123
    },
    {
      "epoch": 1.9375,
      "grad_norm": 2.3360488414764404,
      "learning_rate": 0.00019231250000000002,
      "loss": 2.0238,
      "step": 124
    },
    {
      "epoch": 1.953125,
      "grad_norm": 2.4380905628204346,
      "learning_rate": 0.00019225,
      "loss": 2.1855,
      "step": 125
    },
    {
      "epoch": 1.96875,
      "grad_norm": 2.4325408935546875,
      "learning_rate": 0.0001921875,
      "loss": 2.0395,
      "step": 126
    },
    {
      "epoch": 1.984375,
      "grad_norm": 2.199697732925415,
      "learning_rate": 0.000192125,
      "loss": 2.0803,
      "step": 127
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.9024784564971924,
      "learning_rate": 0.0001920625,
      "loss": 2.3262,
      "step": 128
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.886915922164917,
      "eval_runtime": 2.8551,
      "eval_samples_per_second": 179.327,
      "eval_steps_per_second": 44.832,
      "step": 128
    },
    {
      "epoch": 2.015625,
      "grad_norm": 2.1814963817596436,
      "learning_rate": 0.000192,
      "loss": 1.9321,
      "step": 129
    },
    {
      "epoch": 2.03125,
      "grad_norm": 1.96181321144104,
      "learning_rate": 0.00019193750000000002,
      "loss": 1.9302,
      "step": 130
    },
    {
      "epoch": 2.046875,
      "grad_norm": 2.149388074874878,
      "learning_rate": 0.00019187500000000002,
      "loss": 1.8971,
      "step": 131
    },
    {
      "epoch": 2.0625,
      "grad_norm": 2.1721854209899902,
      "learning_rate": 0.0001918125,
      "loss": 2.2672,
      "step": 132
    },
    {
      "epoch": 2.078125,
      "grad_norm": 1.9983744621276855,
      "learning_rate": 0.00019175,
      "loss": 1.9044,
      "step": 133
    },
    {
      "epoch": 2.09375,
      "grad_norm": 1.9716640710830688,
      "learning_rate": 0.00019168750000000002,
      "loss": 2.0532,
      "step": 134
    },
    {
      "epoch": 2.109375,
      "grad_norm": 1.9328125715255737,
      "learning_rate": 0.000191625,
      "loss": 2.2892,
      "step": 135
    },
    {
      "epoch": 2.125,
      "grad_norm": 2.36125111579895,
      "learning_rate": 0.0001915625,
      "loss": 2.2464,
      "step": 136
    },
    {
      "epoch": 2.140625,
      "grad_norm": 1.9948935508728027,
      "learning_rate": 0.00019150000000000002,
      "loss": 1.8616,
      "step": 137
    },
    {
      "epoch": 2.15625,
      "grad_norm": 2.0453758239746094,
      "learning_rate": 0.0001914375,
      "loss": 1.917,
      "step": 138
    },
    {
      "epoch": 2.171875,
      "grad_norm": 2.020028591156006,
      "learning_rate": 0.000191375,
      "loss": 1.9316,
      "step": 139
    },
    {
      "epoch": 2.1875,
      "grad_norm": 2.3078503608703613,
      "learning_rate": 0.0001913125,
      "loss": 2.2993,
      "step": 140
    },
    {
      "epoch": 2.203125,
      "grad_norm": 2.2256014347076416,
      "learning_rate": 0.00019125000000000001,
      "loss": 2.257,
      "step": 141
    },
    {
      "epoch": 2.21875,
      "grad_norm": 2.516554355621338,
      "learning_rate": 0.0001911875,
      "loss": 2.0035,
      "step": 142
    },
    {
      "epoch": 2.234375,
      "grad_norm": 1.9573105573654175,
      "learning_rate": 0.000191125,
      "loss": 2.1782,
      "step": 143
    },
    {
      "epoch": 2.25,
      "grad_norm": 2.5398168563842773,
      "learning_rate": 0.00019106250000000002,
      "loss": 2.4373,
      "step": 144
    },
    {
      "epoch": 2.265625,
      "grad_norm": 1.9387798309326172,
      "learning_rate": 0.000191,
      "loss": 2.1843,
      "step": 145
    },
    {
      "epoch": 2.28125,
      "grad_norm": 2.2774064540863037,
      "learning_rate": 0.00019093750000000003,
      "loss": 2.288,
      "step": 146
    },
    {
      "epoch": 2.296875,
      "grad_norm": 2.5299739837646484,
      "learning_rate": 0.000190875,
      "loss": 1.9533,
      "step": 147
    },
    {
      "epoch": 2.3125,
      "grad_norm": 2.0150017738342285,
      "learning_rate": 0.0001908125,
      "loss": 1.8598,
      "step": 148
    },
    {
      "epoch": 2.328125,
      "grad_norm": 1.8501254320144653,
      "learning_rate": 0.00019075,
      "loss": 2.3234,
      "step": 149
    },
    {
      "epoch": 2.34375,
      "grad_norm": 1.8371587991714478,
      "learning_rate": 0.00019068750000000002,
      "loss": 2.127,
      "step": 150
    },
    {
      "epoch": 2.359375,
      "grad_norm": 1.9305922985076904,
      "learning_rate": 0.000190625,
      "loss": 2.2452,
      "step": 151
    },
    {
      "epoch": 2.375,
      "grad_norm": 2.248922348022461,
      "learning_rate": 0.0001905625,
      "loss": 1.9822,
      "step": 152
    },
    {
      "epoch": 2.390625,
      "grad_norm": 2.395979642868042,
      "learning_rate": 0.00019050000000000002,
      "loss": 2.1588,
      "step": 153
    },
    {
      "epoch": 2.40625,
      "grad_norm": 2.164294958114624,
      "learning_rate": 0.00019043750000000001,
      "loss": 2.485,
      "step": 154
    },
    {
      "epoch": 2.421875,
      "grad_norm": 2.339989423751831,
      "learning_rate": 0.000190375,
      "loss": 1.7241,
      "step": 155
    },
    {
      "epoch": 2.4375,
      "grad_norm": 2.04276704788208,
      "learning_rate": 0.0001903125,
      "loss": 2.1731,
      "step": 156
    },
    {
      "epoch": 2.453125,
      "grad_norm": 2.1837618350982666,
      "learning_rate": 0.00019025000000000002,
      "loss": 2.2736,
      "step": 157
    },
    {
      "epoch": 2.46875,
      "grad_norm": 2.312350273132324,
      "learning_rate": 0.0001901875,
      "loss": 2.165,
      "step": 158
    },
    {
      "epoch": 2.484375,
      "grad_norm": 2.177719831466675,
      "learning_rate": 0.00019012500000000003,
      "loss": 1.6659,
      "step": 159
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.110541820526123,
      "learning_rate": 0.00019006250000000002,
      "loss": 2.1067,
      "step": 160
    },
    {
      "epoch": 2.515625,
      "grad_norm": 1.8952856063842773,
      "learning_rate": 0.00019,
      "loss": 2.2842,
      "step": 161
    },
    {
      "epoch": 2.53125,
      "grad_norm": 2.2907278537750244,
      "learning_rate": 0.0001899375,
      "loss": 2.296,
      "step": 162
    },
    {
      "epoch": 2.546875,
      "grad_norm": 2.335812568664551,
      "learning_rate": 0.000189875,
      "loss": 2.2967,
      "step": 163
    },
    {
      "epoch": 2.5625,
      "grad_norm": 1.9978564977645874,
      "learning_rate": 0.0001898125,
      "loss": 1.994,
      "step": 164
    },
    {
      "epoch": 2.578125,
      "grad_norm": 2.209967613220215,
      "learning_rate": 0.00018975,
      "loss": 2.0181,
      "step": 165
    },
    {
      "epoch": 2.59375,
      "grad_norm": 2.301626682281494,
      "learning_rate": 0.00018968750000000002,
      "loss": 2.0237,
      "step": 166
    },
    {
      "epoch": 2.609375,
      "grad_norm": 2.0514559745788574,
      "learning_rate": 0.00018962500000000001,
      "loss": 2.0068,
      "step": 167
    },
    {
      "epoch": 2.625,
      "grad_norm": 2.198521614074707,
      "learning_rate": 0.0001895625,
      "loss": 2.2064,
      "step": 168
    },
    {
      "epoch": 2.640625,
      "grad_norm": 2.3864235877990723,
      "learning_rate": 0.0001895,
      "loss": 2.3511,
      "step": 169
    },
    {
      "epoch": 2.65625,
      "grad_norm": 2.311760425567627,
      "learning_rate": 0.0001894375,
      "loss": 2.2032,
      "step": 170
    },
    {
      "epoch": 2.671875,
      "grad_norm": 2.4159317016601562,
      "learning_rate": 0.000189375,
      "loss": 1.86,
      "step": 171
    },
    {
      "epoch": 2.6875,
      "grad_norm": 2.1375012397766113,
      "learning_rate": 0.0001893125,
      "loss": 2.0132,
      "step": 172
    },
    {
      "epoch": 2.703125,
      "grad_norm": 2.3399925231933594,
      "learning_rate": 0.00018925000000000002,
      "loss": 1.89,
      "step": 173
    },
    {
      "epoch": 2.71875,
      "grad_norm": 2.1650888919830322,
      "learning_rate": 0.0001891875,
      "loss": 2.2221,
      "step": 174
    },
    {
      "epoch": 2.734375,
      "grad_norm": 2.235187530517578,
      "learning_rate": 0.00018912500000000003,
      "loss": 1.9724,
      "step": 175
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.9995495080947876,
      "learning_rate": 0.00018906250000000002,
      "loss": 2.2262,
      "step": 176
    },
    {
      "epoch": 2.765625,
      "grad_norm": 2.251384973526001,
      "learning_rate": 0.00018899999999999999,
      "loss": 1.9207,
      "step": 177
    },
    {
      "epoch": 2.78125,
      "grad_norm": 2.1612207889556885,
      "learning_rate": 0.0001889375,
      "loss": 2.1477,
      "step": 178
    },
    {
      "epoch": 2.796875,
      "grad_norm": 2.183742046356201,
      "learning_rate": 0.000188875,
      "loss": 1.8831,
      "step": 179
    },
    {
      "epoch": 2.8125,
      "grad_norm": 2.287141799926758,
      "learning_rate": 0.00018881250000000002,
      "loss": 1.934,
      "step": 180
    },
    {
      "epoch": 2.828125,
      "grad_norm": 2.309579372406006,
      "learning_rate": 0.00018875,
      "loss": 2.2491,
      "step": 181
    },
    {
      "epoch": 2.84375,
      "grad_norm": 1.9064360857009888,
      "learning_rate": 0.00018868750000000003,
      "loss": 1.967,
      "step": 182
    },
    {
      "epoch": 2.859375,
      "grad_norm": 2.2318437099456787,
      "learning_rate": 0.00018862500000000002,
      "loss": 2.171,
      "step": 183
    },
    {
      "epoch": 2.875,
      "grad_norm": 2.4751651287078857,
      "learning_rate": 0.0001885625,
      "loss": 2.1502,
      "step": 184
    },
    {
      "epoch": 2.890625,
      "grad_norm": 2.0199458599090576,
      "learning_rate": 0.0001885,
      "loss": 2.3423,
      "step": 185
    },
    {
      "epoch": 2.90625,
      "grad_norm": 2.3951938152313232,
      "learning_rate": 0.0001884375,
      "loss": 2.1796,
      "step": 186
    },
    {
      "epoch": 2.921875,
      "grad_norm": 2.0879955291748047,
      "learning_rate": 0.000188375,
      "loss": 2.0861,
      "step": 187
    },
    {
      "epoch": 2.9375,
      "grad_norm": 1.9395372867584229,
      "learning_rate": 0.0001883125,
      "loss": 2.1733,
      "step": 188
    },
    {
      "epoch": 2.953125,
      "grad_norm": 2.210157871246338,
      "learning_rate": 0.00018825000000000002,
      "loss": 1.9572,
      "step": 189
    },
    {
      "epoch": 2.96875,
      "grad_norm": 2.308798313140869,
      "learning_rate": 0.00018818750000000001,
      "loss": 2.2594,
      "step": 190
    },
    {
      "epoch": 2.984375,
      "grad_norm": 2.210737466812134,
      "learning_rate": 0.000188125,
      "loss": 2.1295,
      "step": 191
    },
    {
      "epoch": 3.0,
      "grad_norm": 2.837263584136963,
      "learning_rate": 0.0001880625,
      "loss": 2.1014,
      "step": 192
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.8908658027648926,
      "eval_runtime": 2.89,
      "eval_samples_per_second": 177.165,
      "eval_steps_per_second": 44.291,
      "step": 192
    },
    {
      "epoch": 3.015625,
      "grad_norm": 2.255011796951294,
      "learning_rate": 0.000188,
      "loss": 2.0161,
      "step": 193
    },
    {
      "epoch": 3.03125,
      "grad_norm": 1.893722653388977,
      "learning_rate": 0.0001879375,
      "loss": 2.1896,
      "step": 194
    },
    {
      "epoch": 3.046875,
      "grad_norm": 2.0481247901916504,
      "learning_rate": 0.000187875,
      "loss": 1.9638,
      "step": 195
    },
    {
      "epoch": 3.0625,
      "grad_norm": 2.0363235473632812,
      "learning_rate": 0.00018781250000000002,
      "loss": 1.8076,
      "step": 196
    },
    {
      "epoch": 3.078125,
      "grad_norm": 1.9724034070968628,
      "learning_rate": 0.00018775,
      "loss": 2.2986,
      "step": 197
    },
    {
      "epoch": 3.09375,
      "grad_norm": 2.157177448272705,
      "learning_rate": 0.00018768750000000003,
      "loss": 1.9682,
      "step": 198
    },
    {
      "epoch": 3.109375,
      "grad_norm": 2.028055191040039,
      "learning_rate": 0.000187625,
      "loss": 2.264,
      "step": 199
    },
    {
      "epoch": 3.125,
      "grad_norm": 2.452801465988159,
      "learning_rate": 0.0001875625,
      "loss": 2.002,
      "step": 200
    },
    {
      "epoch": 3.140625,
      "grad_norm": 2.008810520172119,
      "learning_rate": 0.0001875,
      "loss": 2.2561,
      "step": 201
    },
    {
      "epoch": 3.15625,
      "grad_norm": 2.267183542251587,
      "learning_rate": 0.0001874375,
      "loss": 2.0523,
      "step": 202
    },
    {
      "epoch": 3.171875,
      "grad_norm": 2.165590763092041,
      "learning_rate": 0.00018737500000000001,
      "loss": 1.867,
      "step": 203
    },
    {
      "epoch": 3.1875,
      "grad_norm": 2.0411489009857178,
      "learning_rate": 0.0001873125,
      "loss": 2.2495,
      "step": 204
    },
    {
      "epoch": 3.203125,
      "grad_norm": 1.9857840538024902,
      "learning_rate": 0.00018725000000000002,
      "loss": 2.3949,
      "step": 205
    },
    {
      "epoch": 3.21875,
      "grad_norm": 2.3708934783935547,
      "learning_rate": 0.00018718750000000002,
      "loss": 1.9615,
      "step": 206
    },
    {
      "epoch": 3.234375,
      "grad_norm": 2.4492502212524414,
      "learning_rate": 0.000187125,
      "loss": 2.0334,
      "step": 207
    },
    {
      "epoch": 3.25,
      "grad_norm": 2.033364772796631,
      "learning_rate": 0.0001870625,
      "loss": 1.9213,
      "step": 208
    },
    {
      "epoch": 3.265625,
      "grad_norm": 2.2740161418914795,
      "learning_rate": 0.00018700000000000002,
      "loss": 2.0532,
      "step": 209
    },
    {
      "epoch": 3.28125,
      "grad_norm": 2.2689876556396484,
      "learning_rate": 0.0001869375,
      "loss": 1.6974,
      "step": 210
    },
    {
      "epoch": 3.296875,
      "grad_norm": 2.292792558670044,
      "learning_rate": 0.000186875,
      "loss": 1.8126,
      "step": 211
    },
    {
      "epoch": 3.3125,
      "grad_norm": 1.9989880323410034,
      "learning_rate": 0.00018681250000000002,
      "loss": 2.2238,
      "step": 212
    },
    {
      "epoch": 3.328125,
      "grad_norm": 2.145343780517578,
      "learning_rate": 0.00018675,
      "loss": 2.1415,
      "step": 213
    },
    {
      "epoch": 3.34375,
      "grad_norm": 2.2462754249572754,
      "learning_rate": 0.0001866875,
      "loss": 1.947,
      "step": 214
    },
    {
      "epoch": 3.359375,
      "grad_norm": 2.169337511062622,
      "learning_rate": 0.000186625,
      "loss": 2.2935,
      "step": 215
    },
    {
      "epoch": 3.375,
      "grad_norm": 2.0395560264587402,
      "learning_rate": 0.00018656250000000001,
      "loss": 2.0825,
      "step": 216
    },
    {
      "epoch": 3.390625,
      "grad_norm": 2.0452020168304443,
      "learning_rate": 0.0001865,
      "loss": 2.1338,
      "step": 217
    },
    {
      "epoch": 3.40625,
      "grad_norm": 2.064013957977295,
      "learning_rate": 0.0001864375,
      "loss": 1.9782,
      "step": 218
    },
    {
      "epoch": 3.421875,
      "grad_norm": 2.102935314178467,
      "learning_rate": 0.00018637500000000002,
      "loss": 2.1176,
      "step": 219
    },
    {
      "epoch": 3.4375,
      "grad_norm": 2.0496442317962646,
      "learning_rate": 0.0001863125,
      "loss": 2.0683,
      "step": 220
    },
    {
      "epoch": 3.453125,
      "grad_norm": 2.403358221054077,
      "learning_rate": 0.00018625,
      "loss": 1.9265,
      "step": 221
    },
    {
      "epoch": 3.46875,
      "grad_norm": 2.1873435974121094,
      "learning_rate": 0.0001861875,
      "loss": 1.9203,
      "step": 222
    },
    {
      "epoch": 3.484375,
      "grad_norm": 2.0549967288970947,
      "learning_rate": 0.000186125,
      "loss": 2.1973,
      "step": 223
    },
    {
      "epoch": 3.5,
      "grad_norm": 2.338463068008423,
      "learning_rate": 0.0001860625,
      "loss": 1.712,
      "step": 224
    },
    {
      "epoch": 3.515625,
      "grad_norm": 2.238734006881714,
      "learning_rate": 0.00018600000000000002,
      "loss": 1.9559,
      "step": 225
    },
    {
      "epoch": 3.53125,
      "grad_norm": 2.125492572784424,
      "learning_rate": 0.0001859375,
      "loss": 1.9859,
      "step": 226
    },
    {
      "epoch": 3.546875,
      "grad_norm": 2.3447721004486084,
      "learning_rate": 0.000185875,
      "loss": 1.961,
      "step": 227
    },
    {
      "epoch": 3.5625,
      "grad_norm": 2.231221914291382,
      "learning_rate": 0.00018581250000000002,
      "loss": 2.0695,
      "step": 228
    },
    {
      "epoch": 3.578125,
      "grad_norm": 2.158933401107788,
      "learning_rate": 0.00018575,
      "loss": 1.919,
      "step": 229
    },
    {
      "epoch": 3.59375,
      "grad_norm": 2.365084648132324,
      "learning_rate": 0.0001856875,
      "loss": 2.2052,
      "step": 230
    },
    {
      "epoch": 3.609375,
      "grad_norm": 2.344510078430176,
      "learning_rate": 0.000185625,
      "loss": 1.8415,
      "step": 231
    },
    {
      "epoch": 3.625,
      "grad_norm": 2.0569498538970947,
      "learning_rate": 0.00018556250000000002,
      "loss": 2.1332,
      "step": 232
    },
    {
      "epoch": 3.640625,
      "grad_norm": 2.669703483581543,
      "learning_rate": 0.0001855,
      "loss": 2.2354,
      "step": 233
    },
    {
      "epoch": 3.65625,
      "grad_norm": 2.100593090057373,
      "learning_rate": 0.00018543750000000003,
      "loss": 2.4377,
      "step": 234
    },
    {
      "epoch": 3.671875,
      "grad_norm": 2.423942804336548,
      "learning_rate": 0.00018537500000000002,
      "loss": 2.1459,
      "step": 235
    },
    {
      "epoch": 3.6875,
      "grad_norm": 2.4003162384033203,
      "learning_rate": 0.0001853125,
      "loss": 2.0079,
      "step": 236
    },
    {
      "epoch": 3.703125,
      "grad_norm": 2.107288122177124,
      "learning_rate": 0.00018525,
      "loss": 1.921,
      "step": 237
    },
    {
      "epoch": 3.71875,
      "grad_norm": 2.071561098098755,
      "learning_rate": 0.0001851875,
      "loss": 1.9829,
      "step": 238
    },
    {
      "epoch": 3.734375,
      "grad_norm": 2.0371205806732178,
      "learning_rate": 0.00018512500000000001,
      "loss": 2.3567,
      "step": 239
    },
    {
      "epoch": 3.75,
      "grad_norm": 2.358224868774414,
      "learning_rate": 0.0001850625,
      "loss": 1.9961,
      "step": 240
    },
    {
      "epoch": 3.765625,
      "grad_norm": 2.1896815299987793,
      "learning_rate": 0.00018500000000000002,
      "loss": 2.112,
      "step": 241
    },
    {
      "epoch": 3.78125,
      "grad_norm": 1.9947091341018677,
      "learning_rate": 0.00018493750000000002,
      "loss": 2.0756,
      "step": 242
    },
    {
      "epoch": 3.796875,
      "grad_norm": 2.041844129562378,
      "learning_rate": 0.000184875,
      "loss": 2.2771,
      "step": 243
    },
    {
      "epoch": 3.8125,
      "grad_norm": 2.3417446613311768,
      "learning_rate": 0.0001848125,
      "loss": 2.0781,
      "step": 244
    },
    {
      "epoch": 3.828125,
      "grad_norm": 2.150759696960449,
      "learning_rate": 0.00018475,
      "loss": 1.9239,
      "step": 245
    },
    {
      "epoch": 3.84375,
      "grad_norm": 2.4807302951812744,
      "learning_rate": 0.0001846875,
      "loss": 1.794,
      "step": 246
    },
    {
      "epoch": 3.859375,
      "grad_norm": 2.2032504081726074,
      "learning_rate": 0.000184625,
      "loss": 2.1752,
      "step": 247
    },
    {
      "epoch": 3.875,
      "grad_norm": 2.025437355041504,
      "learning_rate": 0.00018456250000000002,
      "loss": 2.1705,
      "step": 248
    },
    {
      "epoch": 3.890625,
      "grad_norm": 1.9233052730560303,
      "learning_rate": 0.0001845,
      "loss": 2.1928,
      "step": 249
    },
    {
      "epoch": 3.90625,
      "grad_norm": 2.3572933673858643,
      "learning_rate": 0.00018443750000000003,
      "loss": 1.9668,
      "step": 250
    },
    {
      "epoch": 3.921875,
      "grad_norm": 2.2967443466186523,
      "learning_rate": 0.000184375,
      "loss": 2.124,
      "step": 251
    },
    {
      "epoch": 3.9375,
      "grad_norm": 1.8813748359680176,
      "learning_rate": 0.0001843125,
      "loss": 2.045,
      "step": 252
    },
    {
      "epoch": 3.953125,
      "grad_norm": 2.2912278175354004,
      "learning_rate": 0.00018425,
      "loss": 2.1945,
      "step": 253
    },
    {
      "epoch": 3.96875,
      "grad_norm": 2.3599064350128174,
      "learning_rate": 0.0001841875,
      "loss": 2.0037,
      "step": 254
    },
    {
      "epoch": 3.984375,
      "grad_norm": 2.4134318828582764,
      "learning_rate": 0.00018412500000000002,
      "loss": 2.1824,
      "step": 255
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.228060245513916,
      "learning_rate": 0.0001840625,
      "loss": 2.0425,
      "step": 256
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.880183219909668,
      "eval_runtime": 2.891,
      "eval_samples_per_second": 177.1,
      "eval_steps_per_second": 44.275,
      "step": 256
    },
    {
      "epoch": 4.015625,
      "grad_norm": 2.095828056335449,
      "learning_rate": 0.00018400000000000003,
      "loss": 1.8205,
      "step": 257
    },
    {
      "epoch": 4.03125,
      "grad_norm": 2.209404468536377,
      "learning_rate": 0.00018393750000000002,
      "loss": 2.1914,
      "step": 258
    },
    {
      "epoch": 4.046875,
      "grad_norm": 1.9071506261825562,
      "learning_rate": 0.000183875,
      "loss": 1.6143,
      "step": 259
    },
    {
      "epoch": 4.0625,
      "grad_norm": 1.8737274408340454,
      "learning_rate": 0.0001838125,
      "loss": 1.98,
      "step": 260
    },
    {
      "epoch": 4.078125,
      "grad_norm": 1.9302678108215332,
      "learning_rate": 0.00018375,
      "loss": 1.8749,
      "step": 261
    },
    {
      "epoch": 4.09375,
      "grad_norm": 1.9175666570663452,
      "learning_rate": 0.0001836875,
      "loss": 2.0434,
      "step": 262
    },
    {
      "epoch": 4.109375,
      "grad_norm": 2.1320698261260986,
      "learning_rate": 0.000183625,
      "loss": 1.9737,
      "step": 263
    },
    {
      "epoch": 4.125,
      "grad_norm": 2.227386713027954,
      "learning_rate": 0.00018356250000000002,
      "loss": 2.1099,
      "step": 264
    },
    {
      "epoch": 4.140625,
      "grad_norm": 2.3246567249298096,
      "learning_rate": 0.00018350000000000002,
      "loss": 2.052,
      "step": 265
    },
    {
      "epoch": 4.15625,
      "grad_norm": 2.225412607192993,
      "learning_rate": 0.0001834375,
      "loss": 1.6609,
      "step": 266
    },
    {
      "epoch": 4.171875,
      "grad_norm": 2.013270139694214,
      "learning_rate": 0.000183375,
      "loss": 1.8764,
      "step": 267
    },
    {
      "epoch": 4.1875,
      "grad_norm": 2.1060280799865723,
      "learning_rate": 0.0001833125,
      "loss": 1.9295,
      "step": 268
    },
    {
      "epoch": 4.203125,
      "grad_norm": 2.091536045074463,
      "learning_rate": 0.00018325,
      "loss": 1.8831,
      "step": 269
    },
    {
      "epoch": 4.21875,
      "grad_norm": 2.243474245071411,
      "learning_rate": 0.0001831875,
      "loss": 2.0246,
      "step": 270
    },
    {
      "epoch": 4.234375,
      "grad_norm": 2.310749053955078,
      "learning_rate": 0.00018312500000000002,
      "loss": 1.9343,
      "step": 271
    },
    {
      "epoch": 4.25,
      "grad_norm": 2.0412168502807617,
      "learning_rate": 0.0001830625,
      "loss": 2.3121,
      "step": 272
    },
    {
      "epoch": 4.265625,
      "grad_norm": 2.0920653343200684,
      "learning_rate": 0.000183,
      "loss": 1.6983,
      "step": 273
    },
    {
      "epoch": 4.28125,
      "grad_norm": 2.158403158187866,
      "learning_rate": 0.0001829375,
      "loss": 1.9478,
      "step": 274
    },
    {
      "epoch": 4.296875,
      "grad_norm": 2.2241930961608887,
      "learning_rate": 0.000182875,
      "loss": 2.0754,
      "step": 275
    },
    {
      "epoch": 4.3125,
      "grad_norm": 2.0819287300109863,
      "learning_rate": 0.0001828125,
      "loss": 2.0538,
      "step": 276
    },
    {
      "epoch": 4.328125,
      "grad_norm": 2.16375994682312,
      "learning_rate": 0.00018275,
      "loss": 2.2053,
      "step": 277
    },
    {
      "epoch": 4.34375,
      "grad_norm": 2.2670319080352783,
      "learning_rate": 0.00018268750000000002,
      "loss": 2.1062,
      "step": 278
    },
    {
      "epoch": 4.359375,
      "grad_norm": 2.1608707904815674,
      "learning_rate": 0.000182625,
      "loss": 2.4383,
      "step": 279
    },
    {
      "epoch": 4.375,
      "grad_norm": 1.961107611656189,
      "learning_rate": 0.00018256250000000003,
      "loss": 1.8317,
      "step": 280
    },
    {
      "epoch": 4.390625,
      "grad_norm": 2.1089468002319336,
      "learning_rate": 0.0001825,
      "loss": 2.1066,
      "step": 281
    },
    {
      "epoch": 4.40625,
      "grad_norm": 2.4322924613952637,
      "learning_rate": 0.0001824375,
      "loss": 2.1242,
      "step": 282
    },
    {
      "epoch": 4.421875,
      "grad_norm": 2.157012701034546,
      "learning_rate": 0.000182375,
      "loss": 1.7922,
      "step": 283
    },
    {
      "epoch": 4.4375,
      "grad_norm": 2.3589415550231934,
      "learning_rate": 0.00018231250000000002,
      "loss": 1.8798,
      "step": 284
    },
    {
      "epoch": 4.453125,
      "grad_norm": 2.314608097076416,
      "learning_rate": 0.00018225,
      "loss": 1.7439,
      "step": 285
    },
    {
      "epoch": 4.46875,
      "grad_norm": 2.434788465499878,
      "learning_rate": 0.0001821875,
      "loss": 2.079,
      "step": 286
    },
    {
      "epoch": 4.484375,
      "grad_norm": 2.1248931884765625,
      "learning_rate": 0.00018212500000000002,
      "loss": 2.0578,
      "step": 287
    },
    {
      "epoch": 4.5,
      "grad_norm": 2.3001790046691895,
      "learning_rate": 0.00018206250000000001,
      "loss": 2.1258,
      "step": 288
    },
    {
      "epoch": 4.515625,
      "grad_norm": 2.205014228820801,
      "learning_rate": 0.000182,
      "loss": 2.0428,
      "step": 289
    },
    {
      "epoch": 4.53125,
      "grad_norm": 2.309617757797241,
      "learning_rate": 0.0001819375,
      "loss": 1.9364,
      "step": 290
    },
    {
      "epoch": 4.546875,
      "grad_norm": 2.5770673751831055,
      "learning_rate": 0.00018187500000000002,
      "loss": 2.0191,
      "step": 291
    },
    {
      "epoch": 4.5625,
      "grad_norm": 2.068974733352661,
      "learning_rate": 0.0001818125,
      "loss": 2.3478,
      "step": 292
    },
    {
      "epoch": 4.578125,
      "grad_norm": 1.8852288722991943,
      "learning_rate": 0.00018175,
      "loss": 2.1205,
      "step": 293
    },
    {
      "epoch": 4.59375,
      "grad_norm": 1.9321250915527344,
      "learning_rate": 0.00018168750000000002,
      "loss": 1.8773,
      "step": 294
    },
    {
      "epoch": 4.609375,
      "grad_norm": 2.0716617107391357,
      "learning_rate": 0.000181625,
      "loss": 2.0271,
      "step": 295
    },
    {
      "epoch": 4.625,
      "grad_norm": 2.2472848892211914,
      "learning_rate": 0.0001815625,
      "loss": 1.7174,
      "step": 296
    },
    {
      "epoch": 4.640625,
      "grad_norm": 2.2851064205169678,
      "learning_rate": 0.0001815,
      "loss": 2.3991,
      "step": 297
    },
    {
      "epoch": 4.65625,
      "grad_norm": 1.9442219734191895,
      "learning_rate": 0.0001814375,
      "loss": 1.8495,
      "step": 298
    },
    {
      "epoch": 4.671875,
      "grad_norm": 2.387310743331909,
      "learning_rate": 0.000181375,
      "loss": 1.9339,
      "step": 299
    },
    {
      "epoch": 4.6875,
      "grad_norm": 2.5196168422698975,
      "learning_rate": 0.00018131250000000002,
      "loss": 1.91,
      "step": 300
    },
    {
      "epoch": 4.703125,
      "grad_norm": 2.518406867980957,
      "learning_rate": 0.00018125000000000001,
      "loss": 2.2362,
      "step": 301
    },
    {
      "epoch": 4.71875,
      "grad_norm": 2.1618905067443848,
      "learning_rate": 0.0001811875,
      "loss": 2.0222,
      "step": 302
    },
    {
      "epoch": 4.734375,
      "grad_norm": 2.1897802352905273,
      "learning_rate": 0.000181125,
      "loss": 2.0974,
      "step": 303
    },
    {
      "epoch": 4.75,
      "grad_norm": 2.0711474418640137,
      "learning_rate": 0.0001810625,
      "loss": 1.5881,
      "step": 304
    },
    {
      "epoch": 4.765625,
      "grad_norm": 2.538362741470337,
      "learning_rate": 0.000181,
      "loss": 1.8384,
      "step": 305
    },
    {
      "epoch": 4.78125,
      "grad_norm": 2.1048808097839355,
      "learning_rate": 0.0001809375,
      "loss": 2.0613,
      "step": 306
    },
    {
      "epoch": 4.796875,
      "grad_norm": 2.568692445755005,
      "learning_rate": 0.00018087500000000002,
      "loss": 1.94,
      "step": 307
    },
    {
      "epoch": 4.8125,
      "grad_norm": 2.109229326248169,
      "learning_rate": 0.0001808125,
      "loss": 2.0983,
      "step": 308
    },
    {
      "epoch": 4.828125,
      "grad_norm": 2.0645155906677246,
      "learning_rate": 0.00018075000000000003,
      "loss": 2.014,
      "step": 309
    },
    {
      "epoch": 4.84375,
      "grad_norm": 2.011540174484253,
      "learning_rate": 0.00018068750000000002,
      "loss": 2.0,
      "step": 310
    },
    {
      "epoch": 4.859375,
      "grad_norm": 2.318828582763672,
      "learning_rate": 0.000180625,
      "loss": 2.0616,
      "step": 311
    },
    {
      "epoch": 4.875,
      "grad_norm": 2.28702712059021,
      "learning_rate": 0.0001805625,
      "loss": 2.048,
      "step": 312
    },
    {
      "epoch": 4.890625,
      "grad_norm": 2.178147315979004,
      "learning_rate": 0.0001805,
      "loss": 2.0022,
      "step": 313
    },
    {
      "epoch": 4.90625,
      "grad_norm": 2.02931547164917,
      "learning_rate": 0.00018043750000000002,
      "loss": 2.1363,
      "step": 314
    },
    {
      "epoch": 4.921875,
      "grad_norm": 2.2108683586120605,
      "learning_rate": 0.000180375,
      "loss": 2.1562,
      "step": 315
    },
    {
      "epoch": 4.9375,
      "grad_norm": 2.154141426086426,
      "learning_rate": 0.00018031250000000003,
      "loss": 1.8616,
      "step": 316
    },
    {
      "epoch": 4.953125,
      "grad_norm": 2.2269527912139893,
      "learning_rate": 0.00018025000000000002,
      "loss": 2.1677,
      "step": 317
    },
    {
      "epoch": 4.96875,
      "grad_norm": 2.435858726501465,
      "learning_rate": 0.0001801875,
      "loss": 1.8111,
      "step": 318
    },
    {
      "epoch": 4.984375,
      "grad_norm": 2.051780939102173,
      "learning_rate": 0.000180125,
      "loss": 2.1448,
      "step": 319
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.3915035724639893,
      "learning_rate": 0.0001800625,
      "loss": 2.2102,
      "step": 320
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.917266607284546,
      "eval_runtime": 2.9005,
      "eval_samples_per_second": 176.52,
      "eval_steps_per_second": 44.13,
      "step": 320
    },
    {
      "epoch": 5.015625,
      "grad_norm": 2.120759963989258,
      "learning_rate": 0.00018,
      "loss": 2.1839,
      "step": 321
    },
    {
      "epoch": 5.03125,
      "grad_norm": 2.2079567909240723,
      "learning_rate": 0.0001799375,
      "loss": 2.0844,
      "step": 322
    },
    {
      "epoch": 5.046875,
      "grad_norm": 2.627847194671631,
      "learning_rate": 0.00017987500000000002,
      "loss": 2.2549,
      "step": 323
    },
    {
      "epoch": 5.0625,
      "grad_norm": 1.8195698261260986,
      "learning_rate": 0.00017981250000000001,
      "loss": 2.1788,
      "step": 324
    },
    {
      "epoch": 5.078125,
      "grad_norm": 2.0179152488708496,
      "learning_rate": 0.00017975,
      "loss": 2.0846,
      "step": 325
    },
    {
      "epoch": 5.09375,
      "grad_norm": 2.0287959575653076,
      "learning_rate": 0.0001796875,
      "loss": 1.7265,
      "step": 326
    },
    {
      "epoch": 5.109375,
      "grad_norm": 2.2764880657196045,
      "learning_rate": 0.000179625,
      "loss": 1.9101,
      "step": 327
    },
    {
      "epoch": 5.125,
      "grad_norm": 2.0309367179870605,
      "learning_rate": 0.0001795625,
      "loss": 1.8625,
      "step": 328
    },
    {
      "epoch": 5.140625,
      "grad_norm": 2.1108546257019043,
      "learning_rate": 0.0001795,
      "loss": 2.1135,
      "step": 329
    },
    {
      "epoch": 5.15625,
      "grad_norm": 2.1286518573760986,
      "learning_rate": 0.00017943750000000002,
      "loss": 2.0253,
      "step": 330
    },
    {
      "epoch": 5.171875,
      "grad_norm": 2.4778544902801514,
      "learning_rate": 0.000179375,
      "loss": 1.9572,
      "step": 331
    },
    {
      "epoch": 5.1875,
      "grad_norm": 2.449178457260132,
      "learning_rate": 0.00017931250000000003,
      "loss": 2.1139,
      "step": 332
    },
    {
      "epoch": 5.203125,
      "grad_norm": 2.284151077270508,
      "learning_rate": 0.00017925000000000002,
      "loss": 2.0068,
      "step": 333
    },
    {
      "epoch": 5.21875,
      "grad_norm": 2.4463508129119873,
      "learning_rate": 0.0001791875,
      "loss": 2.1864,
      "step": 334
    },
    {
      "epoch": 5.234375,
      "grad_norm": 2.2183680534362793,
      "learning_rate": 0.000179125,
      "loss": 1.7477,
      "step": 335
    },
    {
      "epoch": 5.25,
      "grad_norm": 2.235490083694458,
      "learning_rate": 0.0001790625,
      "loss": 1.795,
      "step": 336
    },
    {
      "epoch": 5.265625,
      "grad_norm": 2.383176326751709,
      "learning_rate": 0.00017900000000000001,
      "loss": 1.965,
      "step": 337
    },
    {
      "epoch": 5.28125,
      "grad_norm": 2.0087239742279053,
      "learning_rate": 0.0001789375,
      "loss": 1.9875,
      "step": 338
    },
    {
      "epoch": 5.296875,
      "grad_norm": 2.1201846599578857,
      "learning_rate": 0.00017887500000000002,
      "loss": 2.1969,
      "step": 339
    },
    {
      "epoch": 5.3125,
      "grad_norm": 2.114637613296509,
      "learning_rate": 0.00017881250000000002,
      "loss": 1.952,
      "step": 340
    },
    {
      "epoch": 5.328125,
      "grad_norm": 2.170652389526367,
      "learning_rate": 0.00017875,
      "loss": 1.8871,
      "step": 341
    },
    {
      "epoch": 5.34375,
      "grad_norm": 2.072985887527466,
      "learning_rate": 0.0001786875,
      "loss": 1.8237,
      "step": 342
    },
    {
      "epoch": 5.359375,
      "grad_norm": 2.025641918182373,
      "learning_rate": 0.000178625,
      "loss": 2.2651,
      "step": 343
    },
    {
      "epoch": 5.375,
      "grad_norm": 2.1659653186798096,
      "learning_rate": 0.0001785625,
      "loss": 1.7982,
      "step": 344
    },
    {
      "epoch": 5.390625,
      "grad_norm": 2.167815685272217,
      "learning_rate": 0.0001785,
      "loss": 1.9615,
      "step": 345
    },
    {
      "epoch": 5.40625,
      "grad_norm": 2.150723695755005,
      "learning_rate": 0.00017843750000000002,
      "loss": 2.0605,
      "step": 346
    },
    {
      "epoch": 5.421875,
      "grad_norm": 2.6162667274475098,
      "learning_rate": 0.000178375,
      "loss": 1.9482,
      "step": 347
    },
    {
      "epoch": 5.4375,
      "grad_norm": 2.548208713531494,
      "learning_rate": 0.0001783125,
      "loss": 2.288,
      "step": 348
    },
    {
      "epoch": 5.453125,
      "grad_norm": 2.1978580951690674,
      "learning_rate": 0.00017825,
      "loss": 2.1435,
      "step": 349
    },
    {
      "epoch": 5.46875,
      "grad_norm": 2.079634189605713,
      "learning_rate": 0.00017818750000000001,
      "loss": 2.1062,
      "step": 350
    },
    {
      "epoch": 5.484375,
      "grad_norm": 2.2800724506378174,
      "learning_rate": 0.000178125,
      "loss": 1.7649,
      "step": 351
    },
    {
      "epoch": 5.5,
      "grad_norm": 1.9444358348846436,
      "learning_rate": 0.0001780625,
      "loss": 1.753,
      "step": 352
    },
    {
      "epoch": 5.515625,
      "grad_norm": 2.0687570571899414,
      "learning_rate": 0.00017800000000000002,
      "loss": 1.8249,
      "step": 353
    },
    {
      "epoch": 5.53125,
      "grad_norm": 2.2764570713043213,
      "learning_rate": 0.0001779375,
      "loss": 2.2716,
      "step": 354
    },
    {
      "epoch": 5.546875,
      "grad_norm": 2.154142141342163,
      "learning_rate": 0.000177875,
      "loss": 1.8829,
      "step": 355
    },
    {
      "epoch": 5.5625,
      "grad_norm": 1.9636160135269165,
      "learning_rate": 0.0001778125,
      "loss": 2.2211,
      "step": 356
    },
    {
      "epoch": 5.578125,
      "grad_norm": 2.4619970321655273,
      "learning_rate": 0.00017775,
      "loss": 1.9222,
      "step": 357
    },
    {
      "epoch": 5.59375,
      "grad_norm": 2.0046257972717285,
      "learning_rate": 0.0001776875,
      "loss": 1.966,
      "step": 358
    },
    {
      "epoch": 5.609375,
      "grad_norm": 2.217724323272705,
      "learning_rate": 0.00017762500000000002,
      "loss": 1.9105,
      "step": 359
    },
    {
      "epoch": 5.625,
      "grad_norm": 2.0602707862854004,
      "learning_rate": 0.0001775625,
      "loss": 2.0448,
      "step": 360
    },
    {
      "epoch": 5.640625,
      "grad_norm": 2.1410017013549805,
      "learning_rate": 0.0001775,
      "loss": 2.0401,
      "step": 361
    },
    {
      "epoch": 5.65625,
      "grad_norm": 2.350202798843384,
      "learning_rate": 0.00017743750000000002,
      "loss": 1.7784,
      "step": 362
    },
    {
      "epoch": 5.671875,
      "grad_norm": 2.18508243560791,
      "learning_rate": 0.00017737500000000002,
      "loss": 1.7233,
      "step": 363
    },
    {
      "epoch": 5.6875,
      "grad_norm": 2.28578782081604,
      "learning_rate": 0.0001773125,
      "loss": 1.8179,
      "step": 364
    },
    {
      "epoch": 5.703125,
      "grad_norm": 2.1914920806884766,
      "learning_rate": 0.00017725,
      "loss": 1.8497,
      "step": 365
    },
    {
      "epoch": 5.71875,
      "grad_norm": 2.1230130195617676,
      "learning_rate": 0.00017718750000000002,
      "loss": 1.9903,
      "step": 366
    },
    {
      "epoch": 5.734375,
      "grad_norm": 2.1576662063598633,
      "learning_rate": 0.000177125,
      "loss": 1.9616,
      "step": 367
    },
    {
      "epoch": 5.75,
      "grad_norm": 2.268322467803955,
      "learning_rate": 0.0001770625,
      "loss": 1.6958,
      "step": 368
    },
    {
      "epoch": 5.765625,
      "grad_norm": 1.9469624757766724,
      "learning_rate": 0.00017700000000000002,
      "loss": 1.3919,
      "step": 369
    },
    {
      "epoch": 5.78125,
      "grad_norm": 2.078338384628296,
      "learning_rate": 0.0001769375,
      "loss": 1.8864,
      "step": 370
    },
    {
      "epoch": 5.796875,
      "grad_norm": 2.227593421936035,
      "learning_rate": 0.000176875,
      "loss": 1.7437,
      "step": 371
    },
    {
      "epoch": 5.8125,
      "grad_norm": 2.328946113586426,
      "learning_rate": 0.0001768125,
      "loss": 1.7807,
      "step": 372
    },
    {
      "epoch": 5.828125,
      "grad_norm": 2.099926471710205,
      "learning_rate": 0.00017675000000000001,
      "loss": 2.1933,
      "step": 373
    },
    {
      "epoch": 5.84375,
      "grad_norm": 2.5344040393829346,
      "learning_rate": 0.0001766875,
      "loss": 1.9879,
      "step": 374
    },
    {
      "epoch": 5.859375,
      "grad_norm": 2.617253541946411,
      "learning_rate": 0.00017662500000000002,
      "loss": 1.6151,
      "step": 375
    },
    {
      "epoch": 5.875,
      "grad_norm": 2.6120169162750244,
      "learning_rate": 0.00017656250000000002,
      "loss": 2.0491,
      "step": 376
    },
    {
      "epoch": 5.890625,
      "grad_norm": 2.5537655353546143,
      "learning_rate": 0.0001765,
      "loss": 2.0614,
      "step": 377
    },
    {
      "epoch": 5.90625,
      "grad_norm": 1.7676783800125122,
      "learning_rate": 0.0001764375,
      "loss": 1.9326,
      "step": 378
    },
    {
      "epoch": 5.921875,
      "grad_norm": 1.9976332187652588,
      "learning_rate": 0.000176375,
      "loss": 2.0548,
      "step": 379
    },
    {
      "epoch": 5.9375,
      "grad_norm": 2.033198595046997,
      "learning_rate": 0.0001763125,
      "loss": 1.8251,
      "step": 380
    },
    {
      "epoch": 5.953125,
      "grad_norm": 2.4363577365875244,
      "learning_rate": 0.00017625,
      "loss": 2.2543,
      "step": 381
    },
    {
      "epoch": 5.96875,
      "grad_norm": 2.1352076530456543,
      "learning_rate": 0.00017618750000000002,
      "loss": 2.0294,
      "step": 382
    },
    {
      "epoch": 5.984375,
      "grad_norm": 2.1414918899536133,
      "learning_rate": 0.000176125,
      "loss": 1.8644,
      "step": 383
    },
    {
      "epoch": 6.0,
      "grad_norm": 2.892679452896118,
      "learning_rate": 0.00017606250000000003,
      "loss": 2.2449,
      "step": 384
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.909318447113037,
      "eval_runtime": 2.9488,
      "eval_samples_per_second": 173.633,
      "eval_steps_per_second": 43.408,
      "step": 384
    },
    {
      "epoch": 6.015625,
      "grad_norm": 2.0949251651763916,
      "learning_rate": 0.00017600000000000002,
      "loss": 1.9729,
      "step": 385
    },
    {
      "epoch": 6.03125,
      "grad_norm": 2.0999860763549805,
      "learning_rate": 0.0001759375,
      "loss": 1.8161,
      "step": 386
    },
    {
      "epoch": 6.046875,
      "grad_norm": 2.2663252353668213,
      "learning_rate": 0.000175875,
      "loss": 1.927,
      "step": 387
    },
    {
      "epoch": 6.0625,
      "grad_norm": 1.9525647163391113,
      "learning_rate": 0.0001758125,
      "loss": 1.5963,
      "step": 388
    },
    {
      "epoch": 6.078125,
      "grad_norm": 2.3670706748962402,
      "learning_rate": 0.00017575000000000002,
      "loss": 1.634,
      "step": 389
    },
    {
      "epoch": 6.09375,
      "grad_norm": 2.2162442207336426,
      "learning_rate": 0.0001756875,
      "loss": 1.9384,
      "step": 390
    },
    {
      "epoch": 6.109375,
      "grad_norm": 2.2549643516540527,
      "learning_rate": 0.00017562500000000003,
      "loss": 1.968,
      "step": 391
    },
    {
      "epoch": 6.125,
      "grad_norm": 2.337101459503174,
      "learning_rate": 0.00017556250000000002,
      "loss": 2.173,
      "step": 392
    },
    {
      "epoch": 6.140625,
      "grad_norm": 2.2713441848754883,
      "learning_rate": 0.0001755,
      "loss": 2.0718,
      "step": 393
    },
    {
      "epoch": 6.15625,
      "grad_norm": 2.076369524002075,
      "learning_rate": 0.0001754375,
      "loss": 1.8723,
      "step": 394
    },
    {
      "epoch": 6.171875,
      "grad_norm": 1.8909765481948853,
      "learning_rate": 0.000175375,
      "loss": 2.0393,
      "step": 395
    },
    {
      "epoch": 6.1875,
      "grad_norm": 2.1223037242889404,
      "learning_rate": 0.0001753125,
      "loss": 1.8475,
      "step": 396
    },
    {
      "epoch": 6.203125,
      "grad_norm": 2.019847869873047,
      "learning_rate": 0.00017525,
      "loss": 1.9105,
      "step": 397
    },
    {
      "epoch": 6.21875,
      "grad_norm": 2.370594024658203,
      "learning_rate": 0.00017518750000000002,
      "loss": 1.7972,
      "step": 398
    },
    {
      "epoch": 6.234375,
      "grad_norm": 1.8369842767715454,
      "learning_rate": 0.00017512500000000001,
      "loss": 2.0244,
      "step": 399
    },
    {
      "epoch": 6.25,
      "grad_norm": 2.1094727516174316,
      "learning_rate": 0.0001750625,
      "loss": 1.8593,
      "step": 400
    },
    {
      "epoch": 6.265625,
      "grad_norm": 2.0651814937591553,
      "learning_rate": 0.000175,
      "loss": 1.9739,
      "step": 401
    },
    {
      "epoch": 6.28125,
      "grad_norm": 2.0755412578582764,
      "learning_rate": 0.0001749375,
      "loss": 1.4391,
      "step": 402
    },
    {
      "epoch": 6.296875,
      "grad_norm": 2.16894793510437,
      "learning_rate": 0.000174875,
      "loss": 1.9601,
      "step": 403
    },
    {
      "epoch": 6.3125,
      "grad_norm": 2.3159677982330322,
      "learning_rate": 0.0001748125,
      "loss": 1.8649,
      "step": 404
    },
    {
      "epoch": 6.328125,
      "grad_norm": 2.5531983375549316,
      "learning_rate": 0.00017475000000000002,
      "loss": 1.7747,
      "step": 405
    },
    {
      "epoch": 6.34375,
      "grad_norm": 2.1167869567871094,
      "learning_rate": 0.0001746875,
      "loss": 1.5205,
      "step": 406
    },
    {
      "epoch": 6.359375,
      "grad_norm": 2.3358945846557617,
      "learning_rate": 0.00017462500000000003,
      "loss": 1.9744,
      "step": 407
    },
    {
      "epoch": 6.375,
      "grad_norm": 1.9740853309631348,
      "learning_rate": 0.0001745625,
      "loss": 1.7333,
      "step": 408
    },
    {
      "epoch": 6.390625,
      "grad_norm": 1.9886573553085327,
      "learning_rate": 0.0001745,
      "loss": 1.8523,
      "step": 409
    },
    {
      "epoch": 6.40625,
      "grad_norm": 2.5249621868133545,
      "learning_rate": 0.0001744375,
      "loss": 1.9561,
      "step": 410
    },
    {
      "epoch": 6.421875,
      "grad_norm": 2.4359922409057617,
      "learning_rate": 0.000174375,
      "loss": 1.9896,
      "step": 411
    },
    {
      "epoch": 6.4375,
      "grad_norm": 2.20324444770813,
      "learning_rate": 0.00017431250000000002,
      "loss": 1.8581,
      "step": 412
    },
    {
      "epoch": 6.453125,
      "grad_norm": 2.133639097213745,
      "learning_rate": 0.00017425,
      "loss": 1.6483,
      "step": 413
    },
    {
      "epoch": 6.46875,
      "grad_norm": 2.39289927482605,
      "learning_rate": 0.00017418750000000003,
      "loss": 1.9668,
      "step": 414
    },
    {
      "epoch": 6.484375,
      "grad_norm": 2.47939395904541,
      "learning_rate": 0.00017412500000000002,
      "loss": 1.85,
      "step": 415
    },
    {
      "epoch": 6.5,
      "grad_norm": 2.122438669204712,
      "learning_rate": 0.0001740625,
      "loss": 1.6601,
      "step": 416
    },
    {
      "epoch": 6.515625,
      "grad_norm": 2.3710861206054688,
      "learning_rate": 0.000174,
      "loss": 2.0525,
      "step": 417
    },
    {
      "epoch": 6.53125,
      "grad_norm": 2.3549163341522217,
      "learning_rate": 0.0001739375,
      "loss": 2.0888,
      "step": 418
    },
    {
      "epoch": 6.546875,
      "grad_norm": 2.275439977645874,
      "learning_rate": 0.000173875,
      "loss": 2.0012,
      "step": 419
    },
    {
      "epoch": 6.5625,
      "grad_norm": 2.3488376140594482,
      "learning_rate": 0.0001738125,
      "loss": 1.9676,
      "step": 420
    },
    {
      "epoch": 6.578125,
      "grad_norm": 2.138463258743286,
      "learning_rate": 0.00017375000000000002,
      "loss": 2.2738,
      "step": 421
    },
    {
      "epoch": 6.59375,
      "grad_norm": 2.1751039028167725,
      "learning_rate": 0.00017368750000000001,
      "loss": 1.8936,
      "step": 422
    },
    {
      "epoch": 6.609375,
      "grad_norm": 2.7962615489959717,
      "learning_rate": 0.000173625,
      "loss": 1.9442,
      "step": 423
    },
    {
      "epoch": 6.625,
      "grad_norm": 2.549434185028076,
      "learning_rate": 0.0001735625,
      "loss": 2.3174,
      "step": 424
    },
    {
      "epoch": 6.640625,
      "grad_norm": 2.026712417602539,
      "learning_rate": 0.00017350000000000002,
      "loss": 1.5497,
      "step": 425
    },
    {
      "epoch": 6.65625,
      "grad_norm": 2.0612540245056152,
      "learning_rate": 0.0001734375,
      "loss": 1.6709,
      "step": 426
    },
    {
      "epoch": 6.671875,
      "grad_norm": 2.213505506515503,
      "learning_rate": 0.000173375,
      "loss": 2.013,
      "step": 427
    },
    {
      "epoch": 6.6875,
      "grad_norm": 1.9914507865905762,
      "learning_rate": 0.00017331250000000002,
      "loss": 1.5845,
      "step": 428
    },
    {
      "epoch": 6.703125,
      "grad_norm": 2.3677332401275635,
      "learning_rate": 0.00017325,
      "loss": 1.8039,
      "step": 429
    },
    {
      "epoch": 6.71875,
      "grad_norm": 2.103104591369629,
      "learning_rate": 0.0001731875,
      "loss": 2.1197,
      "step": 430
    },
    {
      "epoch": 6.734375,
      "grad_norm": 2.3859806060791016,
      "learning_rate": 0.000173125,
      "loss": 1.7168,
      "step": 431
    },
    {
      "epoch": 6.75,
      "grad_norm": 2.205716848373413,
      "learning_rate": 0.0001730625,
      "loss": 2.1259,
      "step": 432
    },
    {
      "epoch": 6.765625,
      "grad_norm": 2.2673592567443848,
      "learning_rate": 0.000173,
      "loss": 2.2256,
      "step": 433
    },
    {
      "epoch": 6.78125,
      "grad_norm": 2.1451516151428223,
      "learning_rate": 0.00017293750000000002,
      "loss": 1.9625,
      "step": 434
    },
    {
      "epoch": 6.796875,
      "grad_norm": 2.2576138973236084,
      "learning_rate": 0.00017287500000000001,
      "loss": 1.8213,
      "step": 435
    },
    {
      "epoch": 6.8125,
      "grad_norm": 2.2774157524108887,
      "learning_rate": 0.0001728125,
      "loss": 1.9723,
      "step": 436
    },
    {
      "epoch": 6.828125,
      "grad_norm": 2.3646910190582275,
      "learning_rate": 0.00017275000000000002,
      "loss": 1.8294,
      "step": 437
    },
    {
      "epoch": 6.84375,
      "grad_norm": 2.563215494155884,
      "learning_rate": 0.0001726875,
      "loss": 2.0801,
      "step": 438
    },
    {
      "epoch": 6.859375,
      "grad_norm": 2.101099967956543,
      "learning_rate": 0.000172625,
      "loss": 1.8522,
      "step": 439
    },
    {
      "epoch": 6.875,
      "grad_norm": 2.3665337562561035,
      "learning_rate": 0.0001725625,
      "loss": 1.8743,
      "step": 440
    },
    {
      "epoch": 6.890625,
      "grad_norm": 1.9036853313446045,
      "learning_rate": 0.00017250000000000002,
      "loss": 1.9667,
      "step": 441
    },
    {
      "epoch": 6.90625,
      "grad_norm": 2.129814624786377,
      "learning_rate": 0.0001724375,
      "loss": 1.9709,
      "step": 442
    },
    {
      "epoch": 6.921875,
      "grad_norm": 2.2054028511047363,
      "learning_rate": 0.000172375,
      "loss": 2.1082,
      "step": 443
    },
    {
      "epoch": 6.9375,
      "grad_norm": 2.271404981613159,
      "learning_rate": 0.00017231250000000002,
      "loss": 1.6582,
      "step": 444
    },
    {
      "epoch": 6.953125,
      "grad_norm": 2.333354949951172,
      "learning_rate": 0.00017225,
      "loss": 2.2754,
      "step": 445
    },
    {
      "epoch": 6.96875,
      "grad_norm": 2.268486976623535,
      "learning_rate": 0.0001721875,
      "loss": 2.1082,
      "step": 446
    },
    {
      "epoch": 6.984375,
      "grad_norm": 2.1974897384643555,
      "learning_rate": 0.000172125,
      "loss": 2.2616,
      "step": 447
    },
    {
      "epoch": 7.0,
      "grad_norm": 2.505781650543213,
      "learning_rate": 0.00017206250000000001,
      "loss": 2.2871,
      "step": 448
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.908600330352783,
      "eval_runtime": 2.9041,
      "eval_samples_per_second": 176.305,
      "eval_steps_per_second": 44.076,
      "step": 448
    },
    {
      "epoch": 7.015625,
      "grad_norm": 2.1730833053588867,
      "learning_rate": 0.000172,
      "loss": 2.0247,
      "step": 449
    },
    {
      "epoch": 7.03125,
      "grad_norm": 2.273693323135376,
      "learning_rate": 0.00017193750000000003,
      "loss": 1.7866,
      "step": 450
    },
    {
      "epoch": 7.046875,
      "grad_norm": 2.3635385036468506,
      "learning_rate": 0.00017187500000000002,
      "loss": 1.7044,
      "step": 451
    },
    {
      "epoch": 7.0625,
      "grad_norm": 2.1189608573913574,
      "learning_rate": 0.0001718125,
      "loss": 2.0632,
      "step": 452
    },
    {
      "epoch": 7.078125,
      "grad_norm": 2.11604905128479,
      "learning_rate": 0.00017175,
      "loss": 2.0753,
      "step": 453
    },
    {
      "epoch": 7.09375,
      "grad_norm": 2.2215404510498047,
      "learning_rate": 0.0001716875,
      "loss": 1.9778,
      "step": 454
    },
    {
      "epoch": 7.109375,
      "grad_norm": 2.328511953353882,
      "learning_rate": 0.000171625,
      "loss": 1.6274,
      "step": 455
    },
    {
      "epoch": 7.125,
      "grad_norm": 2.255847454071045,
      "learning_rate": 0.0001715625,
      "loss": 1.7153,
      "step": 456
    },
    {
      "epoch": 7.140625,
      "grad_norm": 2.353257894515991,
      "learning_rate": 0.00017150000000000002,
      "loss": 2.0269,
      "step": 457
    },
    {
      "epoch": 7.15625,
      "grad_norm": 2.2891416549682617,
      "learning_rate": 0.0001714375,
      "loss": 1.9384,
      "step": 458
    },
    {
      "epoch": 7.171875,
      "grad_norm": 1.9596359729766846,
      "learning_rate": 0.00017137500000000003,
      "loss": 2.0374,
      "step": 459
    },
    {
      "epoch": 7.1875,
      "grad_norm": 2.015895128250122,
      "learning_rate": 0.0001713125,
      "loss": 1.8218,
      "step": 460
    },
    {
      "epoch": 7.203125,
      "grad_norm": 2.0854995250701904,
      "learning_rate": 0.00017125,
      "loss": 1.9988,
      "step": 461
    },
    {
      "epoch": 7.21875,
      "grad_norm": 2.1683177947998047,
      "learning_rate": 0.0001711875,
      "loss": 2.0296,
      "step": 462
    },
    {
      "epoch": 7.234375,
      "grad_norm": 2.025178909301758,
      "learning_rate": 0.000171125,
      "loss": 2.0032,
      "step": 463
    },
    {
      "epoch": 7.25,
      "grad_norm": 2.2997260093688965,
      "learning_rate": 0.00017106250000000002,
      "loss": 1.69,
      "step": 464
    },
    {
      "epoch": 7.265625,
      "grad_norm": 1.9921984672546387,
      "learning_rate": 0.000171,
      "loss": 1.7677,
      "step": 465
    },
    {
      "epoch": 7.28125,
      "grad_norm": 2.1038150787353516,
      "learning_rate": 0.00017093750000000003,
      "loss": 1.4735,
      "step": 466
    },
    {
      "epoch": 7.296875,
      "grad_norm": 1.8505055904388428,
      "learning_rate": 0.00017087500000000002,
      "loss": 1.7832,
      "step": 467
    },
    {
      "epoch": 7.3125,
      "grad_norm": 2.0327870845794678,
      "learning_rate": 0.00017081249999999998,
      "loss": 2.0372,
      "step": 468
    },
    {
      "epoch": 7.328125,
      "grad_norm": 1.965147852897644,
      "learning_rate": 0.00017075,
      "loss": 1.8144,
      "step": 469
    },
    {
      "epoch": 7.34375,
      "grad_norm": 2.3589670658111572,
      "learning_rate": 0.0001706875,
      "loss": 2.022,
      "step": 470
    },
    {
      "epoch": 7.359375,
      "grad_norm": 1.778564214706421,
      "learning_rate": 0.00017062500000000001,
      "loss": 2.2311,
      "step": 471
    },
    {
      "epoch": 7.375,
      "grad_norm": 2.2599751949310303,
      "learning_rate": 0.0001705625,
      "loss": 1.7277,
      "step": 472
    },
    {
      "epoch": 7.390625,
      "grad_norm": 2.129779577255249,
      "learning_rate": 0.00017050000000000002,
      "loss": 1.8958,
      "step": 473
    },
    {
      "epoch": 7.40625,
      "grad_norm": 2.1927859783172607,
      "learning_rate": 0.00017043750000000002,
      "loss": 1.9236,
      "step": 474
    },
    {
      "epoch": 7.421875,
      "grad_norm": 2.2799794673919678,
      "learning_rate": 0.000170375,
      "loss": 1.852,
      "step": 475
    },
    {
      "epoch": 7.4375,
      "grad_norm": 2.1906204223632812,
      "learning_rate": 0.0001703125,
      "loss": 2.0026,
      "step": 476
    },
    {
      "epoch": 7.453125,
      "grad_norm": 2.1385979652404785,
      "learning_rate": 0.00017025,
      "loss": 1.8226,
      "step": 477
    },
    {
      "epoch": 7.46875,
      "grad_norm": 2.044093608856201,
      "learning_rate": 0.0001701875,
      "loss": 1.7795,
      "step": 478
    },
    {
      "epoch": 7.484375,
      "grad_norm": 2.576472282409668,
      "learning_rate": 0.000170125,
      "loss": 2.1028,
      "step": 479
    },
    {
      "epoch": 7.5,
      "grad_norm": 2.540919303894043,
      "learning_rate": 0.00017006250000000002,
      "loss": 1.7424,
      "step": 480
    },
    {
      "epoch": 7.515625,
      "grad_norm": 2.2926149368286133,
      "learning_rate": 0.00017,
      "loss": 1.7037,
      "step": 481
    },
    {
      "epoch": 7.53125,
      "grad_norm": 2.2666690349578857,
      "learning_rate": 0.0001699375,
      "loss": 1.7081,
      "step": 482
    },
    {
      "epoch": 7.546875,
      "grad_norm": 2.4212775230407715,
      "learning_rate": 0.000169875,
      "loss": 2.1838,
      "step": 483
    },
    {
      "epoch": 7.5625,
      "grad_norm": 2.3984153270721436,
      "learning_rate": 0.00016981250000000001,
      "loss": 1.9018,
      "step": 484
    },
    {
      "epoch": 7.578125,
      "grad_norm": 2.138152837753296,
      "learning_rate": 0.00016975,
      "loss": 2.1786,
      "step": 485
    },
    {
      "epoch": 7.59375,
      "grad_norm": 2.5542893409729004,
      "learning_rate": 0.0001696875,
      "loss": 1.8776,
      "step": 486
    },
    {
      "epoch": 7.609375,
      "grad_norm": 2.243337869644165,
      "learning_rate": 0.00016962500000000002,
      "loss": 1.8271,
      "step": 487
    },
    {
      "epoch": 7.625,
      "grad_norm": 2.309987783432007,
      "learning_rate": 0.0001695625,
      "loss": 1.9508,
      "step": 488
    },
    {
      "epoch": 7.640625,
      "grad_norm": 2.437939405441284,
      "learning_rate": 0.00016950000000000003,
      "loss": 1.7204,
      "step": 489
    },
    {
      "epoch": 7.65625,
      "grad_norm": 2.1012470722198486,
      "learning_rate": 0.0001694375,
      "loss": 2.0556,
      "step": 490
    },
    {
      "epoch": 7.671875,
      "grad_norm": 2.333364248275757,
      "learning_rate": 0.000169375,
      "loss": 1.8387,
      "step": 491
    },
    {
      "epoch": 7.6875,
      "grad_norm": 2.127133846282959,
      "learning_rate": 0.0001693125,
      "loss": 1.5448,
      "step": 492
    },
    {
      "epoch": 7.703125,
      "grad_norm": 2.2239840030670166,
      "learning_rate": 0.00016925,
      "loss": 1.7443,
      "step": 493
    },
    {
      "epoch": 7.71875,
      "grad_norm": 2.1287906169891357,
      "learning_rate": 0.0001691875,
      "loss": 1.8095,
      "step": 494
    },
    {
      "epoch": 7.734375,
      "grad_norm": 2.1724462509155273,
      "learning_rate": 0.000169125,
      "loss": 1.845,
      "step": 495
    },
    {
      "epoch": 7.75,
      "grad_norm": 2.041456460952759,
      "learning_rate": 0.00016906250000000002,
      "loss": 1.9991,
      "step": 496
    },
    {
      "epoch": 7.765625,
      "grad_norm": 2.2009589672088623,
      "learning_rate": 0.00016900000000000002,
      "loss": 1.7645,
      "step": 497
    },
    {
      "epoch": 7.78125,
      "grad_norm": 2.622309446334839,
      "learning_rate": 0.0001689375,
      "loss": 1.826,
      "step": 498
    },
    {
      "epoch": 7.796875,
      "grad_norm": 2.404343366622925,
      "learning_rate": 0.000168875,
      "loss": 2.0272,
      "step": 499
    },
    {
      "epoch": 7.8125,
      "grad_norm": 2.2061214447021484,
      "learning_rate": 0.00016881250000000002,
      "loss": 1.9844,
      "step": 500
    },
    {
      "epoch": 7.828125,
      "grad_norm": 2.4137282371520996,
      "learning_rate": 0.00016875,
      "loss": 1.899,
      "step": 501
    },
    {
      "epoch": 7.84375,
      "grad_norm": 2.1362524032592773,
      "learning_rate": 0.0001686875,
      "loss": 2.0194,
      "step": 502
    },
    {
      "epoch": 7.859375,
      "grad_norm": 2.23091721534729,
      "learning_rate": 0.00016862500000000002,
      "loss": 1.817,
      "step": 503
    },
    {
      "epoch": 7.875,
      "grad_norm": 2.433748245239258,
      "learning_rate": 0.0001685625,
      "loss": 1.7627,
      "step": 504
    },
    {
      "epoch": 7.890625,
      "grad_norm": 2.4921767711639404,
      "learning_rate": 0.0001685,
      "loss": 2.2163,
      "step": 505
    },
    {
      "epoch": 7.90625,
      "grad_norm": 2.2893688678741455,
      "learning_rate": 0.0001684375,
      "loss": 1.8779,
      "step": 506
    },
    {
      "epoch": 7.921875,
      "grad_norm": 2.175520896911621,
      "learning_rate": 0.000168375,
      "loss": 1.8964,
      "step": 507
    },
    {
      "epoch": 7.9375,
      "grad_norm": 2.2230632305145264,
      "learning_rate": 0.0001683125,
      "loss": 1.8831,
      "step": 508
    },
    {
      "epoch": 7.953125,
      "grad_norm": 2.0542025566101074,
      "learning_rate": 0.00016825000000000002,
      "loss": 1.8514,
      "step": 509
    },
    {
      "epoch": 7.96875,
      "grad_norm": 2.2847704887390137,
      "learning_rate": 0.00016818750000000002,
      "loss": 1.9232,
      "step": 510
    },
    {
      "epoch": 7.984375,
      "grad_norm": 2.3431057929992676,
      "learning_rate": 0.000168125,
      "loss": 1.8358,
      "step": 511
    },
    {
      "epoch": 8.0,
      "grad_norm": 2.5366930961608887,
      "learning_rate": 0.0001680625,
      "loss": 1.9398,
      "step": 512
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.9269089698791504,
      "eval_runtime": 2.9025,
      "eval_samples_per_second": 176.401,
      "eval_steps_per_second": 44.1,
      "step": 512
    },
    {
      "epoch": 8.015625,
      "grad_norm": 2.2312567234039307,
      "learning_rate": 0.000168,
      "loss": 1.7331,
      "step": 513
    },
    {
      "epoch": 8.03125,
      "grad_norm": 2.233720541000366,
      "learning_rate": 0.0001679375,
      "loss": 1.8958,
      "step": 514
    },
    {
      "epoch": 8.046875,
      "grad_norm": 2.16745662689209,
      "learning_rate": 0.000167875,
      "loss": 2.1404,
      "step": 515
    },
    {
      "epoch": 8.0625,
      "grad_norm": 2.412119150161743,
      "learning_rate": 0.00016781250000000002,
      "loss": 1.5207,
      "step": 516
    },
    {
      "epoch": 8.078125,
      "grad_norm": 2.2841386795043945,
      "learning_rate": 0.00016775,
      "loss": 1.9406,
      "step": 517
    },
    {
      "epoch": 8.09375,
      "grad_norm": 1.9643691778182983,
      "learning_rate": 0.0001676875,
      "loss": 1.906,
      "step": 518
    },
    {
      "epoch": 8.109375,
      "grad_norm": 2.1247873306274414,
      "learning_rate": 0.00016762500000000002,
      "loss": 1.772,
      "step": 519
    },
    {
      "epoch": 8.125,
      "grad_norm": 2.5045268535614014,
      "learning_rate": 0.0001675625,
      "loss": 1.9035,
      "step": 520
    },
    {
      "epoch": 8.140625,
      "grad_norm": 2.2936697006225586,
      "learning_rate": 0.0001675,
      "loss": 1.5967,
      "step": 521
    },
    {
      "epoch": 8.15625,
      "grad_norm": 2.457949161529541,
      "learning_rate": 0.0001674375,
      "loss": 1.8664,
      "step": 522
    },
    {
      "epoch": 8.171875,
      "grad_norm": 2.7671637535095215,
      "learning_rate": 0.00016737500000000002,
      "loss": 2.0034,
      "step": 523
    },
    {
      "epoch": 8.1875,
      "grad_norm": 2.030120849609375,
      "learning_rate": 0.0001673125,
      "loss": 1.9058,
      "step": 524
    },
    {
      "epoch": 8.203125,
      "grad_norm": 2.1007683277130127,
      "learning_rate": 0.00016725000000000003,
      "loss": 1.794,
      "step": 525
    },
    {
      "epoch": 8.21875,
      "grad_norm": 2.1022138595581055,
      "learning_rate": 0.00016718750000000002,
      "loss": 1.8206,
      "step": 526
    },
    {
      "epoch": 8.234375,
      "grad_norm": 1.9925649166107178,
      "learning_rate": 0.000167125,
      "loss": 2.0515,
      "step": 527
    },
    {
      "epoch": 8.25,
      "grad_norm": 2.052025318145752,
      "learning_rate": 0.0001670625,
      "loss": 1.8172,
      "step": 528
    },
    {
      "epoch": 8.265625,
      "grad_norm": 2.3949203491210938,
      "learning_rate": 0.000167,
      "loss": 1.753,
      "step": 529
    },
    {
      "epoch": 8.28125,
      "grad_norm": 2.0052671432495117,
      "learning_rate": 0.0001669375,
      "loss": 1.8944,
      "step": 530
    },
    {
      "epoch": 8.296875,
      "grad_norm": 2.1424994468688965,
      "learning_rate": 0.000166875,
      "loss": 1.8728,
      "step": 531
    },
    {
      "epoch": 8.3125,
      "grad_norm": 2.4485023021698,
      "learning_rate": 0.00016681250000000002,
      "loss": 1.8589,
      "step": 532
    },
    {
      "epoch": 8.328125,
      "grad_norm": 2.230360984802246,
      "learning_rate": 0.00016675000000000001,
      "loss": 1.914,
      "step": 533
    },
    {
      "epoch": 8.34375,
      "grad_norm": 2.442310094833374,
      "learning_rate": 0.0001666875,
      "loss": 1.6858,
      "step": 534
    },
    {
      "epoch": 8.359375,
      "grad_norm": 2.4155821800231934,
      "learning_rate": 0.000166625,
      "loss": 1.9184,
      "step": 535
    },
    {
      "epoch": 8.375,
      "grad_norm": 2.297701597213745,
      "learning_rate": 0.0001665625,
      "loss": 2.0219,
      "step": 536
    },
    {
      "epoch": 8.390625,
      "grad_norm": 1.9815428256988525,
      "learning_rate": 0.0001665,
      "loss": 1.9617,
      "step": 537
    },
    {
      "epoch": 8.40625,
      "grad_norm": 2.370063066482544,
      "learning_rate": 0.0001664375,
      "loss": 2.0286,
      "step": 538
    },
    {
      "epoch": 8.421875,
      "grad_norm": 2.0552966594696045,
      "learning_rate": 0.00016637500000000002,
      "loss": 1.6543,
      "step": 539
    },
    {
      "epoch": 8.4375,
      "grad_norm": 1.978365421295166,
      "learning_rate": 0.0001663125,
      "loss": 1.9567,
      "step": 540
    },
    {
      "epoch": 8.453125,
      "grad_norm": 2.078308343887329,
      "learning_rate": 0.00016625000000000003,
      "loss": 1.7143,
      "step": 541
    },
    {
      "epoch": 8.46875,
      "grad_norm": 2.2571537494659424,
      "learning_rate": 0.0001661875,
      "loss": 1.8759,
      "step": 542
    },
    {
      "epoch": 8.484375,
      "grad_norm": 2.2274227142333984,
      "learning_rate": 0.00016612499999999999,
      "loss": 1.7802,
      "step": 543
    },
    {
      "epoch": 8.5,
      "grad_norm": 2.2626538276672363,
      "learning_rate": 0.0001660625,
      "loss": 1.8241,
      "step": 544
    },
    {
      "epoch": 8.515625,
      "grad_norm": 2.2740519046783447,
      "learning_rate": 0.000166,
      "loss": 1.6011,
      "step": 545
    },
    {
      "epoch": 8.53125,
      "grad_norm": 2.3994264602661133,
      "learning_rate": 0.00016593750000000002,
      "loss": 1.733,
      "step": 546
    },
    {
      "epoch": 8.546875,
      "grad_norm": 2.5213706493377686,
      "learning_rate": 0.000165875,
      "loss": 1.7876,
      "step": 547
    },
    {
      "epoch": 8.5625,
      "grad_norm": 2.1720640659332275,
      "learning_rate": 0.00016581250000000003,
      "loss": 1.9883,
      "step": 548
    },
    {
      "epoch": 8.578125,
      "grad_norm": 2.2489707469940186,
      "learning_rate": 0.00016575000000000002,
      "loss": 1.8256,
      "step": 549
    },
    {
      "epoch": 8.59375,
      "grad_norm": 2.006251096725464,
      "learning_rate": 0.0001656875,
      "loss": 1.7222,
      "step": 550
    },
    {
      "epoch": 8.609375,
      "grad_norm": 2.212290048599243,
      "learning_rate": 0.000165625,
      "loss": 1.9278,
      "step": 551
    },
    {
      "epoch": 8.625,
      "grad_norm": 2.2717537879943848,
      "learning_rate": 0.0001655625,
      "loss": 1.8404,
      "step": 552
    },
    {
      "epoch": 8.640625,
      "grad_norm": 2.3527419567108154,
      "learning_rate": 0.0001655,
      "loss": 1.8931,
      "step": 553
    },
    {
      "epoch": 8.65625,
      "grad_norm": 2.2251646518707275,
      "learning_rate": 0.0001654375,
      "loss": 1.4868,
      "step": 554
    },
    {
      "epoch": 8.671875,
      "grad_norm": 2.096773862838745,
      "learning_rate": 0.00016537500000000002,
      "loss": 1.9814,
      "step": 555
    },
    {
      "epoch": 8.6875,
      "grad_norm": 1.9918646812438965,
      "learning_rate": 0.0001653125,
      "loss": 1.8595,
      "step": 556
    },
    {
      "epoch": 8.703125,
      "grad_norm": 2.0563623905181885,
      "learning_rate": 0.00016525,
      "loss": 1.6651,
      "step": 557
    },
    {
      "epoch": 8.71875,
      "grad_norm": 2.167246103286743,
      "learning_rate": 0.0001651875,
      "loss": 1.9166,
      "step": 558
    },
    {
      "epoch": 8.734375,
      "grad_norm": 2.4928109645843506,
      "learning_rate": 0.00016512500000000002,
      "loss": 2.101,
      "step": 559
    },
    {
      "epoch": 8.75,
      "grad_norm": 2.379021406173706,
      "learning_rate": 0.0001650625,
      "loss": 1.7689,
      "step": 560
    },
    {
      "epoch": 8.765625,
      "grad_norm": 2.023972988128662,
      "learning_rate": 0.000165,
      "loss": 1.7739,
      "step": 561
    },
    {
      "epoch": 8.78125,
      "grad_norm": 2.7759628295898438,
      "learning_rate": 0.00016493750000000002,
      "loss": 1.7692,
      "step": 562
    },
    {
      "epoch": 8.796875,
      "grad_norm": 2.211047887802124,
      "learning_rate": 0.000164875,
      "loss": 1.7788,
      "step": 563
    },
    {
      "epoch": 8.8125,
      "grad_norm": 2.0137572288513184,
      "learning_rate": 0.0001648125,
      "loss": 1.8604,
      "step": 564
    },
    {
      "epoch": 8.828125,
      "grad_norm": 2.289703369140625,
      "learning_rate": 0.00016475,
      "loss": 1.8493,
      "step": 565
    },
    {
      "epoch": 8.84375,
      "grad_norm": 2.289735794067383,
      "learning_rate": 0.0001646875,
      "loss": 1.9658,
      "step": 566
    },
    {
      "epoch": 8.859375,
      "grad_norm": 2.2268307209014893,
      "learning_rate": 0.000164625,
      "loss": 2.0311,
      "step": 567
    },
    {
      "epoch": 8.875,
      "grad_norm": 2.265080690383911,
      "learning_rate": 0.0001645625,
      "loss": 1.7892,
      "step": 568
    },
    {
      "epoch": 8.890625,
      "grad_norm": 2.3123762607574463,
      "learning_rate": 0.00016450000000000001,
      "loss": 1.9499,
      "step": 569
    },
    {
      "epoch": 8.90625,
      "grad_norm": 2.1713109016418457,
      "learning_rate": 0.0001644375,
      "loss": 1.6361,
      "step": 570
    },
    {
      "epoch": 8.921875,
      "grad_norm": 2.400923252105713,
      "learning_rate": 0.00016437500000000002,
      "loss": 1.5824,
      "step": 571
    },
    {
      "epoch": 8.9375,
      "grad_norm": 2.2059102058410645,
      "learning_rate": 0.0001643125,
      "loss": 1.7101,
      "step": 572
    },
    {
      "epoch": 8.953125,
      "grad_norm": 2.387740135192871,
      "learning_rate": 0.00016425,
      "loss": 1.8816,
      "step": 573
    },
    {
      "epoch": 8.96875,
      "grad_norm": 2.113971471786499,
      "learning_rate": 0.0001641875,
      "loss": 1.8288,
      "step": 574
    },
    {
      "epoch": 8.984375,
      "grad_norm": 2.4622631072998047,
      "learning_rate": 0.00016412500000000002,
      "loss": 1.9893,
      "step": 575
    },
    {
      "epoch": 9.0,
      "grad_norm": 2.0924339294433594,
      "learning_rate": 0.0001640625,
      "loss": 1.4158,
      "step": 576
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.928107261657715,
      "eval_runtime": 2.8863,
      "eval_samples_per_second": 177.39,
      "eval_steps_per_second": 44.347,
      "step": 576
    },
    {
      "epoch": 9.015625,
      "grad_norm": 2.24358868598938,
      "learning_rate": 0.000164,
      "loss": 1.5612,
      "step": 577
    },
    {
      "epoch": 9.03125,
      "grad_norm": 2.5201094150543213,
      "learning_rate": 0.00016393750000000002,
      "loss": 1.8987,
      "step": 578
    },
    {
      "epoch": 9.046875,
      "grad_norm": 2.0477795600891113,
      "learning_rate": 0.000163875,
      "loss": 1.8309,
      "step": 579
    },
    {
      "epoch": 9.0625,
      "grad_norm": 2.166933298110962,
      "learning_rate": 0.0001638125,
      "loss": 2.2187,
      "step": 580
    },
    {
      "epoch": 9.078125,
      "grad_norm": 2.25307035446167,
      "learning_rate": 0.00016375,
      "loss": 1.7973,
      "step": 581
    },
    {
      "epoch": 9.09375,
      "grad_norm": 1.8691284656524658,
      "learning_rate": 0.00016368750000000001,
      "loss": 1.9039,
      "step": 582
    },
    {
      "epoch": 9.109375,
      "grad_norm": 2.2708940505981445,
      "learning_rate": 0.000163625,
      "loss": 1.8442,
      "step": 583
    },
    {
      "epoch": 9.125,
      "grad_norm": 2.265814781188965,
      "learning_rate": 0.00016356250000000003,
      "loss": 1.8637,
      "step": 584
    },
    {
      "epoch": 9.140625,
      "grad_norm": 2.0573971271514893,
      "learning_rate": 0.00016350000000000002,
      "loss": 1.7959,
      "step": 585
    },
    {
      "epoch": 9.15625,
      "grad_norm": 2.319406747817993,
      "learning_rate": 0.0001634375,
      "loss": 1.9761,
      "step": 586
    },
    {
      "epoch": 9.171875,
      "grad_norm": 2.278812885284424,
      "learning_rate": 0.000163375,
      "loss": 1.7241,
      "step": 587
    },
    {
      "epoch": 9.1875,
      "grad_norm": 1.9598394632339478,
      "learning_rate": 0.0001633125,
      "loss": 1.5724,
      "step": 588
    },
    {
      "epoch": 9.203125,
      "grad_norm": 2.190126419067383,
      "learning_rate": 0.00016325,
      "loss": 1.5286,
      "step": 589
    },
    {
      "epoch": 9.21875,
      "grad_norm": 2.445688009262085,
      "learning_rate": 0.0001631875,
      "loss": 1.5293,
      "step": 590
    },
    {
      "epoch": 9.234375,
      "grad_norm": 2.2908105850219727,
      "learning_rate": 0.00016312500000000002,
      "loss": 1.7985,
      "step": 591
    },
    {
      "epoch": 9.25,
      "grad_norm": 2.1769723892211914,
      "learning_rate": 0.0001630625,
      "loss": 1.6969,
      "step": 592
    },
    {
      "epoch": 9.265625,
      "grad_norm": 2.383904218673706,
      "learning_rate": 0.000163,
      "loss": 1.7127,
      "step": 593
    },
    {
      "epoch": 9.28125,
      "grad_norm": 2.2862374782562256,
      "learning_rate": 0.0001629375,
      "loss": 1.8739,
      "step": 594
    },
    {
      "epoch": 9.296875,
      "grad_norm": 2.223707675933838,
      "learning_rate": 0.000162875,
      "loss": 1.7962,
      "step": 595
    },
    {
      "epoch": 9.3125,
      "grad_norm": 2.494347333908081,
      "learning_rate": 0.0001628125,
      "loss": 1.5911,
      "step": 596
    },
    {
      "epoch": 9.328125,
      "grad_norm": 2.4015231132507324,
      "learning_rate": 0.00016275,
      "loss": 1.5654,
      "step": 597
    },
    {
      "epoch": 9.34375,
      "grad_norm": 2.2532594203948975,
      "learning_rate": 0.00016268750000000002,
      "loss": 1.8316,
      "step": 598
    },
    {
      "epoch": 9.359375,
      "grad_norm": 2.3892202377319336,
      "learning_rate": 0.000162625,
      "loss": 1.6442,
      "step": 599
    },
    {
      "epoch": 9.375,
      "grad_norm": 2.2312209606170654,
      "learning_rate": 0.00016256250000000003,
      "loss": 1.8729,
      "step": 600
    },
    {
      "epoch": 9.390625,
      "grad_norm": 2.133073329925537,
      "learning_rate": 0.00016250000000000002,
      "loss": 1.929,
      "step": 601
    },
    {
      "epoch": 9.40625,
      "grad_norm": 2.2961177825927734,
      "learning_rate": 0.0001624375,
      "loss": 1.8615,
      "step": 602
    },
    {
      "epoch": 9.421875,
      "grad_norm": 1.9926925897598267,
      "learning_rate": 0.000162375,
      "loss": 1.8261,
      "step": 603
    },
    {
      "epoch": 9.4375,
      "grad_norm": 2.0108225345611572,
      "learning_rate": 0.0001623125,
      "loss": 1.7296,
      "step": 604
    },
    {
      "epoch": 9.453125,
      "grad_norm": 2.34220814704895,
      "learning_rate": 0.00016225000000000001,
      "loss": 1.9017,
      "step": 605
    },
    {
      "epoch": 9.46875,
      "grad_norm": 2.4773130416870117,
      "learning_rate": 0.0001621875,
      "loss": 2.0684,
      "step": 606
    },
    {
      "epoch": 9.484375,
      "grad_norm": 2.0894439220428467,
      "learning_rate": 0.00016212500000000002,
      "loss": 1.6763,
      "step": 607
    },
    {
      "epoch": 9.5,
      "grad_norm": 2.464740037918091,
      "learning_rate": 0.00016206250000000002,
      "loss": 1.6531,
      "step": 608
    },
    {
      "epoch": 9.515625,
      "grad_norm": 2.224437713623047,
      "learning_rate": 0.000162,
      "loss": 1.8867,
      "step": 609
    },
    {
      "epoch": 9.53125,
      "grad_norm": 2.143871307373047,
      "learning_rate": 0.0001619375,
      "loss": 1.739,
      "step": 610
    },
    {
      "epoch": 9.546875,
      "grad_norm": 2.223813533782959,
      "learning_rate": 0.000161875,
      "loss": 1.7978,
      "step": 611
    },
    {
      "epoch": 9.5625,
      "grad_norm": 2.311605453491211,
      "learning_rate": 0.0001618125,
      "loss": 1.969,
      "step": 612
    },
    {
      "epoch": 9.578125,
      "grad_norm": 2.2747652530670166,
      "learning_rate": 0.00016175,
      "loss": 1.752,
      "step": 613
    },
    {
      "epoch": 9.59375,
      "grad_norm": 2.538778066635132,
      "learning_rate": 0.00016168750000000002,
      "loss": 1.8089,
      "step": 614
    },
    {
      "epoch": 9.609375,
      "grad_norm": 2.1739494800567627,
      "learning_rate": 0.000161625,
      "loss": 1.764,
      "step": 615
    },
    {
      "epoch": 9.625,
      "grad_norm": 2.3505361080169678,
      "learning_rate": 0.0001615625,
      "loss": 1.8846,
      "step": 616
    },
    {
      "epoch": 9.640625,
      "grad_norm": 2.149409294128418,
      "learning_rate": 0.0001615,
      "loss": 1.9567,
      "step": 617
    },
    {
      "epoch": 9.65625,
      "grad_norm": 2.533881187438965,
      "learning_rate": 0.0001614375,
      "loss": 1.71,
      "step": 618
    },
    {
      "epoch": 9.671875,
      "grad_norm": 2.3384110927581787,
      "learning_rate": 0.000161375,
      "loss": 1.5389,
      "step": 619
    },
    {
      "epoch": 9.6875,
      "grad_norm": 2.364567756652832,
      "learning_rate": 0.0001613125,
      "loss": 1.8382,
      "step": 620
    },
    {
      "epoch": 9.703125,
      "grad_norm": 2.2870023250579834,
      "learning_rate": 0.00016125000000000002,
      "loss": 1.853,
      "step": 621
    },
    {
      "epoch": 9.71875,
      "grad_norm": 2.275662660598755,
      "learning_rate": 0.0001611875,
      "loss": 1.8396,
      "step": 622
    },
    {
      "epoch": 9.734375,
      "grad_norm": 1.9663711786270142,
      "learning_rate": 0.00016112500000000003,
      "loss": 1.5773,
      "step": 623
    },
    {
      "epoch": 9.75,
      "grad_norm": 2.3097522258758545,
      "learning_rate": 0.00016106250000000002,
      "loss": 2.0851,
      "step": 624
    },
    {
      "epoch": 9.765625,
      "grad_norm": 2.246379852294922,
      "learning_rate": 0.000161,
      "loss": 1.7247,
      "step": 625
    },
    {
      "epoch": 9.78125,
      "grad_norm": 2.169344902038574,
      "learning_rate": 0.0001609375,
      "loss": 2.1276,
      "step": 626
    },
    {
      "epoch": 9.796875,
      "grad_norm": 1.906529426574707,
      "learning_rate": 0.000160875,
      "loss": 2.0364,
      "step": 627
    },
    {
      "epoch": 9.8125,
      "grad_norm": 2.5416038036346436,
      "learning_rate": 0.0001608125,
      "loss": 1.8036,
      "step": 628
    },
    {
      "epoch": 9.828125,
      "grad_norm": 2.49198842048645,
      "learning_rate": 0.00016075,
      "loss": 1.8787,
      "step": 629
    },
    {
      "epoch": 9.84375,
      "grad_norm": 2.4452872276306152,
      "learning_rate": 0.00016068750000000002,
      "loss": 1.7336,
      "step": 630
    },
    {
      "epoch": 9.859375,
      "grad_norm": 2.217872381210327,
      "learning_rate": 0.00016062500000000001,
      "loss": 1.8199,
      "step": 631
    },
    {
      "epoch": 9.875,
      "grad_norm": 2.285579204559326,
      "learning_rate": 0.0001605625,
      "loss": 2.1605,
      "step": 632
    },
    {
      "epoch": 9.890625,
      "grad_norm": 2.3379883766174316,
      "learning_rate": 0.0001605,
      "loss": 1.7252,
      "step": 633
    },
    {
      "epoch": 9.90625,
      "grad_norm": 2.3336715698242188,
      "learning_rate": 0.00016043750000000002,
      "loss": 1.7826,
      "step": 634
    },
    {
      "epoch": 9.921875,
      "grad_norm": 2.584094762802124,
      "learning_rate": 0.000160375,
      "loss": 1.7143,
      "step": 635
    },
    {
      "epoch": 9.9375,
      "grad_norm": 2.209721565246582,
      "learning_rate": 0.0001603125,
      "loss": 1.8642,
      "step": 636
    },
    {
      "epoch": 9.953125,
      "grad_norm": 2.2932307720184326,
      "learning_rate": 0.00016025000000000002,
      "loss": 1.5394,
      "step": 637
    },
    {
      "epoch": 9.96875,
      "grad_norm": 2.361186981201172,
      "learning_rate": 0.0001601875,
      "loss": 1.809,
      "step": 638
    },
    {
      "epoch": 9.984375,
      "grad_norm": 2.1276447772979736,
      "learning_rate": 0.000160125,
      "loss": 1.9314,
      "step": 639
    },
    {
      "epoch": 10.0,
      "grad_norm": 2.1837852001190186,
      "learning_rate": 0.0001600625,
      "loss": 1.6095,
      "step": 640
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.930947780609131,
      "eval_runtime": 2.8899,
      "eval_samples_per_second": 177.171,
      "eval_steps_per_second": 44.293,
      "step": 640
    },
    {
      "epoch": 10.015625,
      "grad_norm": 2.3074164390563965,
      "learning_rate": 0.00016,
      "loss": 1.5907,
      "step": 641
    },
    {
      "epoch": 10.03125,
      "grad_norm": 2.2002217769622803,
      "learning_rate": 0.0001599375,
      "loss": 1.6856,
      "step": 642
    },
    {
      "epoch": 10.046875,
      "grad_norm": 2.100633144378662,
      "learning_rate": 0.000159875,
      "loss": 1.5141,
      "step": 643
    },
    {
      "epoch": 10.0625,
      "grad_norm": 2.284348726272583,
      "learning_rate": 0.00015981250000000002,
      "loss": 1.8884,
      "step": 644
    },
    {
      "epoch": 10.078125,
      "grad_norm": 2.16558575630188,
      "learning_rate": 0.00015975,
      "loss": 1.7063,
      "step": 645
    },
    {
      "epoch": 10.09375,
      "grad_norm": 1.9127576351165771,
      "learning_rate": 0.0001596875,
      "loss": 2.0437,
      "step": 646
    },
    {
      "epoch": 10.109375,
      "grad_norm": 2.167001485824585,
      "learning_rate": 0.000159625,
      "loss": 1.7786,
      "step": 647
    },
    {
      "epoch": 10.125,
      "grad_norm": 2.4043283462524414,
      "learning_rate": 0.0001595625,
      "loss": 1.5661,
      "step": 648
    },
    {
      "epoch": 10.140625,
      "grad_norm": 2.1501786708831787,
      "learning_rate": 0.0001595,
      "loss": 1.4626,
      "step": 649
    },
    {
      "epoch": 10.15625,
      "grad_norm": 2.6493427753448486,
      "learning_rate": 0.00015943750000000002,
      "loss": 1.5654,
      "step": 650
    },
    {
      "epoch": 10.171875,
      "grad_norm": 1.9820441007614136,
      "learning_rate": 0.000159375,
      "loss": 1.713,
      "step": 651
    },
    {
      "epoch": 10.1875,
      "grad_norm": 2.4711151123046875,
      "learning_rate": 0.0001593125,
      "loss": 1.3668,
      "step": 652
    },
    {
      "epoch": 10.203125,
      "grad_norm": 2.16019868850708,
      "learning_rate": 0.00015925000000000002,
      "loss": 1.9269,
      "step": 653
    },
    {
      "epoch": 10.21875,
      "grad_norm": 2.097602605819702,
      "learning_rate": 0.00015918750000000001,
      "loss": 1.7947,
      "step": 654
    },
    {
      "epoch": 10.234375,
      "grad_norm": 2.2739429473876953,
      "learning_rate": 0.000159125,
      "loss": 1.5876,
      "step": 655
    },
    {
      "epoch": 10.25,
      "grad_norm": 2.3722472190856934,
      "learning_rate": 0.0001590625,
      "loss": 1.4462,
      "step": 656
    },
    {
      "epoch": 10.265625,
      "grad_norm": 2.7367172241210938,
      "learning_rate": 0.00015900000000000002,
      "loss": 1.8342,
      "step": 657
    },
    {
      "epoch": 10.28125,
      "grad_norm": 2.368837356567383,
      "learning_rate": 0.0001589375,
      "loss": 2.0767,
      "step": 658
    },
    {
      "epoch": 10.296875,
      "grad_norm": 2.8920278549194336,
      "learning_rate": 0.00015887500000000003,
      "loss": 1.8081,
      "step": 659
    },
    {
      "epoch": 10.3125,
      "grad_norm": 2.034310817718506,
      "learning_rate": 0.00015881250000000002,
      "loss": 1.6619,
      "step": 660
    },
    {
      "epoch": 10.328125,
      "grad_norm": 2.1725914478302,
      "learning_rate": 0.00015875,
      "loss": 1.8582,
      "step": 661
    },
    {
      "epoch": 10.34375,
      "grad_norm": 2.308198928833008,
      "learning_rate": 0.0001586875,
      "loss": 1.695,
      "step": 662
    },
    {
      "epoch": 10.359375,
      "grad_norm": 2.3269429206848145,
      "learning_rate": 0.000158625,
      "loss": 1.8155,
      "step": 663
    },
    {
      "epoch": 10.375,
      "grad_norm": 2.2426273822784424,
      "learning_rate": 0.0001585625,
      "loss": 1.6828,
      "step": 664
    },
    {
      "epoch": 10.390625,
      "grad_norm": 2.6042051315307617,
      "learning_rate": 0.0001585,
      "loss": 2.0511,
      "step": 665
    },
    {
      "epoch": 10.40625,
      "grad_norm": 2.4417216777801514,
      "learning_rate": 0.00015843750000000002,
      "loss": 1.7759,
      "step": 666
    },
    {
      "epoch": 10.421875,
      "grad_norm": 2.467437505722046,
      "learning_rate": 0.00015837500000000001,
      "loss": 1.9634,
      "step": 667
    },
    {
      "epoch": 10.4375,
      "grad_norm": 2.048336982727051,
      "learning_rate": 0.0001583125,
      "loss": 1.7383,
      "step": 668
    },
    {
      "epoch": 10.453125,
      "grad_norm": 2.320762872695923,
      "learning_rate": 0.00015825,
      "loss": 1.6279,
      "step": 669
    },
    {
      "epoch": 10.46875,
      "grad_norm": 2.3071465492248535,
      "learning_rate": 0.0001581875,
      "loss": 2.0746,
      "step": 670
    },
    {
      "epoch": 10.484375,
      "grad_norm": 2.2344799041748047,
      "learning_rate": 0.000158125,
      "loss": 1.4898,
      "step": 671
    },
    {
      "epoch": 10.5,
      "grad_norm": 2.644700288772583,
      "learning_rate": 0.0001580625,
      "loss": 1.8627,
      "step": 672
    },
    {
      "epoch": 10.515625,
      "grad_norm": 2.677361011505127,
      "learning_rate": 0.00015800000000000002,
      "loss": 1.7063,
      "step": 673
    },
    {
      "epoch": 10.53125,
      "grad_norm": 2.102217674255371,
      "learning_rate": 0.0001579375,
      "loss": 1.6779,
      "step": 674
    },
    {
      "epoch": 10.546875,
      "grad_norm": 2.2334985733032227,
      "learning_rate": 0.00015787500000000003,
      "loss": 1.6433,
      "step": 675
    },
    {
      "epoch": 10.5625,
      "grad_norm": 2.1724841594696045,
      "learning_rate": 0.00015781250000000002,
      "loss": 1.942,
      "step": 676
    },
    {
      "epoch": 10.578125,
      "grad_norm": 2.4805729389190674,
      "learning_rate": 0.00015774999999999999,
      "loss": 1.4976,
      "step": 677
    },
    {
      "epoch": 10.59375,
      "grad_norm": 2.160943031311035,
      "learning_rate": 0.0001576875,
      "loss": 1.9822,
      "step": 678
    },
    {
      "epoch": 10.609375,
      "grad_norm": 2.2224185466766357,
      "learning_rate": 0.000157625,
      "loss": 1.9038,
      "step": 679
    },
    {
      "epoch": 10.625,
      "grad_norm": 2.3008692264556885,
      "learning_rate": 0.00015756250000000001,
      "loss": 1.8542,
      "step": 680
    },
    {
      "epoch": 10.640625,
      "grad_norm": 2.7485814094543457,
      "learning_rate": 0.0001575,
      "loss": 1.8951,
      "step": 681
    },
    {
      "epoch": 10.65625,
      "grad_norm": 2.5242137908935547,
      "learning_rate": 0.00015743750000000003,
      "loss": 1.6224,
      "step": 682
    },
    {
      "epoch": 10.671875,
      "grad_norm": 2.610788106918335,
      "learning_rate": 0.00015737500000000002,
      "loss": 1.8606,
      "step": 683
    },
    {
      "epoch": 10.6875,
      "grad_norm": 2.333233594894409,
      "learning_rate": 0.0001573125,
      "loss": 2.0502,
      "step": 684
    },
    {
      "epoch": 10.703125,
      "grad_norm": 2.3016293048858643,
      "learning_rate": 0.00015725,
      "loss": 1.9288,
      "step": 685
    },
    {
      "epoch": 10.71875,
      "grad_norm": 2.4280173778533936,
      "learning_rate": 0.0001571875,
      "loss": 1.6144,
      "step": 686
    },
    {
      "epoch": 10.734375,
      "grad_norm": 2.3456406593322754,
      "learning_rate": 0.000157125,
      "loss": 1.615,
      "step": 687
    },
    {
      "epoch": 10.75,
      "grad_norm": 2.2216200828552246,
      "learning_rate": 0.0001570625,
      "loss": 1.577,
      "step": 688
    },
    {
      "epoch": 10.765625,
      "grad_norm": 2.2782180309295654,
      "learning_rate": 0.00015700000000000002,
      "loss": 1.7033,
      "step": 689
    },
    {
      "epoch": 10.78125,
      "grad_norm": 2.222768545150757,
      "learning_rate": 0.0001569375,
      "loss": 1.9761,
      "step": 690
    },
    {
      "epoch": 10.796875,
      "grad_norm": 2.4566001892089844,
      "learning_rate": 0.000156875,
      "loss": 1.8484,
      "step": 691
    },
    {
      "epoch": 10.8125,
      "grad_norm": 2.4440314769744873,
      "learning_rate": 0.0001568125,
      "loss": 1.7062,
      "step": 692
    },
    {
      "epoch": 10.828125,
      "grad_norm": 2.3741772174835205,
      "learning_rate": 0.00015675,
      "loss": 1.6175,
      "step": 693
    },
    {
      "epoch": 10.84375,
      "grad_norm": 2.1645596027374268,
      "learning_rate": 0.0001566875,
      "loss": 1.6258,
      "step": 694
    },
    {
      "epoch": 10.859375,
      "grad_norm": 2.2140426635742188,
      "learning_rate": 0.000156625,
      "loss": 1.6349,
      "step": 695
    },
    {
      "epoch": 10.875,
      "grad_norm": 2.0620169639587402,
      "learning_rate": 0.00015656250000000002,
      "loss": 1.7791,
      "step": 696
    },
    {
      "epoch": 10.890625,
      "grad_norm": 2.2362124919891357,
      "learning_rate": 0.0001565,
      "loss": 1.9917,
      "step": 697
    },
    {
      "epoch": 10.90625,
      "grad_norm": 2.2664988040924072,
      "learning_rate": 0.00015643750000000003,
      "loss": 1.8599,
      "step": 698
    },
    {
      "epoch": 10.921875,
      "grad_norm": 2.234086513519287,
      "learning_rate": 0.000156375,
      "loss": 1.8841,
      "step": 699
    },
    {
      "epoch": 10.9375,
      "grad_norm": 2.305896520614624,
      "learning_rate": 0.0001563125,
      "loss": 1.5737,
      "step": 700
    },
    {
      "epoch": 10.953125,
      "grad_norm": 1.8637771606445312,
      "learning_rate": 0.00015625,
      "loss": 1.7968,
      "step": 701
    },
    {
      "epoch": 10.96875,
      "grad_norm": 2.2396085262298584,
      "learning_rate": 0.0001561875,
      "loss": 2.061,
      "step": 702
    },
    {
      "epoch": 10.984375,
      "grad_norm": 2.551762580871582,
      "learning_rate": 0.00015612500000000001,
      "loss": 1.936,
      "step": 703
    },
    {
      "epoch": 11.0,
      "grad_norm": 2.411637783050537,
      "learning_rate": 0.0001560625,
      "loss": 1.7919,
      "step": 704
    },
    {
      "epoch": 11.0,
      "eval_loss": 2.933500289916992,
      "eval_runtime": 2.9035,
      "eval_samples_per_second": 176.339,
      "eval_steps_per_second": 44.085,
      "step": 704
    },
    {
      "epoch": 11.015625,
      "grad_norm": 2.3431365489959717,
      "learning_rate": 0.00015600000000000002,
      "loss": 1.5709,
      "step": 705
    },
    {
      "epoch": 11.03125,
      "grad_norm": 2.1066782474517822,
      "learning_rate": 0.00015593750000000002,
      "loss": 1.7924,
      "step": 706
    },
    {
      "epoch": 11.046875,
      "grad_norm": 2.1444506645202637,
      "learning_rate": 0.000155875,
      "loss": 1.8558,
      "step": 707
    },
    {
      "epoch": 11.0625,
      "grad_norm": 2.1080524921417236,
      "learning_rate": 0.0001558125,
      "loss": 1.5755,
      "step": 708
    },
    {
      "epoch": 11.078125,
      "grad_norm": 2.5518875122070312,
      "learning_rate": 0.00015575000000000002,
      "loss": 1.7563,
      "step": 709
    },
    {
      "epoch": 11.09375,
      "grad_norm": 2.5860557556152344,
      "learning_rate": 0.0001556875,
      "loss": 1.8912,
      "step": 710
    },
    {
      "epoch": 11.109375,
      "grad_norm": 2.2122278213500977,
      "learning_rate": 0.000155625,
      "loss": 1.4562,
      "step": 711
    },
    {
      "epoch": 11.125,
      "grad_norm": 2.137925624847412,
      "learning_rate": 0.00015556250000000002,
      "loss": 1.6892,
      "step": 712
    },
    {
      "epoch": 11.140625,
      "grad_norm": 2.1827003955841064,
      "learning_rate": 0.0001555,
      "loss": 1.7366,
      "step": 713
    },
    {
      "epoch": 11.15625,
      "grad_norm": 2.128649950027466,
      "learning_rate": 0.0001554375,
      "loss": 1.7177,
      "step": 714
    },
    {
      "epoch": 11.171875,
      "grad_norm": 2.0550808906555176,
      "learning_rate": 0.000155375,
      "loss": 1.6169,
      "step": 715
    },
    {
      "epoch": 11.1875,
      "grad_norm": 2.247244358062744,
      "learning_rate": 0.00015531250000000001,
      "loss": 1.6341,
      "step": 716
    },
    {
      "epoch": 11.203125,
      "grad_norm": 2.2258553504943848,
      "learning_rate": 0.00015525,
      "loss": 1.9626,
      "step": 717
    },
    {
      "epoch": 11.21875,
      "grad_norm": 2.0635721683502197,
      "learning_rate": 0.0001551875,
      "loss": 1.6116,
      "step": 718
    },
    {
      "epoch": 11.234375,
      "grad_norm": 2.351469039916992,
      "learning_rate": 0.00015512500000000002,
      "loss": 1.6064,
      "step": 719
    },
    {
      "epoch": 11.25,
      "grad_norm": 2.403266191482544,
      "learning_rate": 0.0001550625,
      "loss": 1.655,
      "step": 720
    },
    {
      "epoch": 11.265625,
      "grad_norm": 2.124384641647339,
      "learning_rate": 0.000155,
      "loss": 1.7365,
      "step": 721
    },
    {
      "epoch": 11.28125,
      "grad_norm": 2.024657964706421,
      "learning_rate": 0.0001549375,
      "loss": 1.7925,
      "step": 722
    },
    {
      "epoch": 11.296875,
      "grad_norm": 2.3111588954925537,
      "learning_rate": 0.000154875,
      "loss": 1.4276,
      "step": 723
    },
    {
      "epoch": 11.3125,
      "grad_norm": 2.4023120403289795,
      "learning_rate": 0.0001548125,
      "loss": 1.8466,
      "step": 724
    },
    {
      "epoch": 11.328125,
      "grad_norm": 2.4005258083343506,
      "learning_rate": 0.00015475000000000002,
      "loss": 1.7928,
      "step": 725
    },
    {
      "epoch": 11.34375,
      "grad_norm": 2.5525643825531006,
      "learning_rate": 0.0001546875,
      "loss": 1.6003,
      "step": 726
    },
    {
      "epoch": 11.359375,
      "grad_norm": 2.63726544380188,
      "learning_rate": 0.000154625,
      "loss": 1.8533,
      "step": 727
    },
    {
      "epoch": 11.375,
      "grad_norm": 2.507533550262451,
      "learning_rate": 0.00015456250000000002,
      "loss": 1.7914,
      "step": 728
    },
    {
      "epoch": 11.390625,
      "grad_norm": 2.072483539581299,
      "learning_rate": 0.0001545,
      "loss": 1.7905,
      "step": 729
    },
    {
      "epoch": 11.40625,
      "grad_norm": 2.537595510482788,
      "learning_rate": 0.0001544375,
      "loss": 1.8421,
      "step": 730
    },
    {
      "epoch": 11.421875,
      "grad_norm": 2.6354904174804688,
      "learning_rate": 0.000154375,
      "loss": 1.5856,
      "step": 731
    },
    {
      "epoch": 11.4375,
      "grad_norm": 2.5034871101379395,
      "learning_rate": 0.00015431250000000002,
      "loss": 1.7827,
      "step": 732
    },
    {
      "epoch": 11.453125,
      "grad_norm": 2.3304390907287598,
      "learning_rate": 0.00015425,
      "loss": 1.8816,
      "step": 733
    },
    {
      "epoch": 11.46875,
      "grad_norm": 2.2017648220062256,
      "learning_rate": 0.00015418750000000003,
      "loss": 1.684,
      "step": 734
    },
    {
      "epoch": 11.484375,
      "grad_norm": 2.13208270072937,
      "learning_rate": 0.00015412500000000002,
      "loss": 1.8194,
      "step": 735
    },
    {
      "epoch": 11.5,
      "grad_norm": 2.448881149291992,
      "learning_rate": 0.0001540625,
      "loss": 1.9284,
      "step": 736
    },
    {
      "epoch": 11.515625,
      "grad_norm": 2.501077890396118,
      "learning_rate": 0.000154,
      "loss": 1.5388,
      "step": 737
    },
    {
      "epoch": 11.53125,
      "grad_norm": 2.045219659805298,
      "learning_rate": 0.0001539375,
      "loss": 2.0848,
      "step": 738
    },
    {
      "epoch": 11.546875,
      "grad_norm": 2.539376974105835,
      "learning_rate": 0.000153875,
      "loss": 1.4876,
      "step": 739
    },
    {
      "epoch": 11.5625,
      "grad_norm": 2.131182909011841,
      "learning_rate": 0.0001538125,
      "loss": 1.6671,
      "step": 740
    },
    {
      "epoch": 11.578125,
      "grad_norm": 2.3153252601623535,
      "learning_rate": 0.00015375000000000002,
      "loss": 1.5894,
      "step": 741
    },
    {
      "epoch": 11.59375,
      "grad_norm": 2.2606701850891113,
      "learning_rate": 0.00015368750000000002,
      "loss": 1.7759,
      "step": 742
    },
    {
      "epoch": 11.609375,
      "grad_norm": 2.421609878540039,
      "learning_rate": 0.000153625,
      "loss": 1.7606,
      "step": 743
    },
    {
      "epoch": 11.625,
      "grad_norm": 2.0911622047424316,
      "learning_rate": 0.0001535625,
      "loss": 1.5704,
      "step": 744
    },
    {
      "epoch": 11.640625,
      "grad_norm": 2.380746841430664,
      "learning_rate": 0.0001535,
      "loss": 1.852,
      "step": 745
    },
    {
      "epoch": 11.65625,
      "grad_norm": 2.300365447998047,
      "learning_rate": 0.0001534375,
      "loss": 1.4159,
      "step": 746
    },
    {
      "epoch": 11.671875,
      "grad_norm": 2.2237894535064697,
      "learning_rate": 0.000153375,
      "loss": 1.7343,
      "step": 747
    },
    {
      "epoch": 11.6875,
      "grad_norm": 2.5165700912475586,
      "learning_rate": 0.00015331250000000002,
      "loss": 1.874,
      "step": 748
    },
    {
      "epoch": 11.703125,
      "grad_norm": 2.378422975540161,
      "learning_rate": 0.00015325,
      "loss": 1.6803,
      "step": 749
    },
    {
      "epoch": 11.71875,
      "grad_norm": 2.4529387950897217,
      "learning_rate": 0.00015318750000000003,
      "loss": 1.7839,
      "step": 750
    },
    {
      "epoch": 11.734375,
      "grad_norm": 2.4714441299438477,
      "learning_rate": 0.000153125,
      "loss": 1.8609,
      "step": 751
    },
    {
      "epoch": 11.75,
      "grad_norm": 2.1130104064941406,
      "learning_rate": 0.0001530625,
      "loss": 1.8626,
      "step": 752
    },
    {
      "epoch": 11.765625,
      "grad_norm": 2.325894355773926,
      "learning_rate": 0.000153,
      "loss": 1.6519,
      "step": 753
    },
    {
      "epoch": 11.78125,
      "grad_norm": 2.3130369186401367,
      "learning_rate": 0.0001529375,
      "loss": 1.6659,
      "step": 754
    },
    {
      "epoch": 11.796875,
      "grad_norm": 2.5026419162750244,
      "learning_rate": 0.00015287500000000002,
      "loss": 1.8941,
      "step": 755
    },
    {
      "epoch": 11.8125,
      "grad_norm": 2.1206886768341064,
      "learning_rate": 0.0001528125,
      "loss": 1.6757,
      "step": 756
    },
    {
      "epoch": 11.828125,
      "grad_norm": 2.693256378173828,
      "learning_rate": 0.00015275000000000003,
      "loss": 1.7977,
      "step": 757
    },
    {
      "epoch": 11.84375,
      "grad_norm": 2.3294525146484375,
      "learning_rate": 0.00015268750000000002,
      "loss": 1.8732,
      "step": 758
    },
    {
      "epoch": 11.859375,
      "grad_norm": 2.4213156700134277,
      "learning_rate": 0.000152625,
      "loss": 1.7088,
      "step": 759
    },
    {
      "epoch": 11.875,
      "grad_norm": 2.429788112640381,
      "learning_rate": 0.0001525625,
      "loss": 1.7764,
      "step": 760
    },
    {
      "epoch": 11.890625,
      "grad_norm": 2.2664430141448975,
      "learning_rate": 0.0001525,
      "loss": 1.8447,
      "step": 761
    },
    {
      "epoch": 11.90625,
      "grad_norm": 2.193838357925415,
      "learning_rate": 0.0001524375,
      "loss": 1.8189,
      "step": 762
    },
    {
      "epoch": 11.921875,
      "grad_norm": 2.524077892303467,
      "learning_rate": 0.000152375,
      "loss": 1.8827,
      "step": 763
    },
    {
      "epoch": 11.9375,
      "grad_norm": 2.4857916831970215,
      "learning_rate": 0.00015231250000000002,
      "loss": 1.8148,
      "step": 764
    },
    {
      "epoch": 11.953125,
      "grad_norm": 2.3152382373809814,
      "learning_rate": 0.00015225000000000001,
      "loss": 1.9399,
      "step": 765
    },
    {
      "epoch": 11.96875,
      "grad_norm": 2.6506340503692627,
      "learning_rate": 0.0001521875,
      "loss": 1.7395,
      "step": 766
    },
    {
      "epoch": 11.984375,
      "grad_norm": 1.96360445022583,
      "learning_rate": 0.000152125,
      "loss": 1.9026,
      "step": 767
    },
    {
      "epoch": 12.0,
      "grad_norm": 3.217047691345215,
      "learning_rate": 0.0001520625,
      "loss": 1.8328,
      "step": 768
    },
    {
      "epoch": 12.0,
      "eval_loss": 2.936393976211548,
      "eval_runtime": 2.888,
      "eval_samples_per_second": 177.287,
      "eval_steps_per_second": 44.322,
      "step": 768
    },
    {
      "epoch": 12.015625,
      "grad_norm": 2.1806581020355225,
      "learning_rate": 0.000152,
      "loss": 1.4526,
      "step": 769
    },
    {
      "epoch": 12.03125,
      "grad_norm": 2.261138439178467,
      "learning_rate": 0.0001519375,
      "loss": 1.5071,
      "step": 770
    },
    {
      "epoch": 12.046875,
      "grad_norm": 2.3709311485290527,
      "learning_rate": 0.00015187500000000002,
      "loss": 1.5693,
      "step": 771
    },
    {
      "epoch": 12.0625,
      "grad_norm": 2.2497310638427734,
      "learning_rate": 0.0001518125,
      "loss": 1.8743,
      "step": 772
    },
    {
      "epoch": 12.078125,
      "grad_norm": 2.4895365238189697,
      "learning_rate": 0.00015175,
      "loss": 1.6991,
      "step": 773
    },
    {
      "epoch": 12.09375,
      "grad_norm": 1.912434697151184,
      "learning_rate": 0.0001516875,
      "loss": 1.6996,
      "step": 774
    },
    {
      "epoch": 12.109375,
      "grad_norm": 2.0765397548675537,
      "learning_rate": 0.000151625,
      "loss": 1.8858,
      "step": 775
    },
    {
      "epoch": 12.125,
      "grad_norm": 2.1880040168762207,
      "learning_rate": 0.0001515625,
      "loss": 1.8728,
      "step": 776
    },
    {
      "epoch": 12.140625,
      "grad_norm": 2.0657882690429688,
      "learning_rate": 0.0001515,
      "loss": 1.6736,
      "step": 777
    },
    {
      "epoch": 12.15625,
      "grad_norm": 2.1537880897521973,
      "learning_rate": 0.00015143750000000002,
      "loss": 1.8122,
      "step": 778
    },
    {
      "epoch": 12.171875,
      "grad_norm": 2.2130722999572754,
      "learning_rate": 0.000151375,
      "loss": 1.7629,
      "step": 779
    },
    {
      "epoch": 12.1875,
      "grad_norm": 2.0490453243255615,
      "learning_rate": 0.00015131250000000003,
      "loss": 1.5339,
      "step": 780
    },
    {
      "epoch": 12.203125,
      "grad_norm": 2.4373228549957275,
      "learning_rate": 0.00015125,
      "loss": 1.7192,
      "step": 781
    },
    {
      "epoch": 12.21875,
      "grad_norm": 2.299255609512329,
      "learning_rate": 0.0001511875,
      "loss": 1.5529,
      "step": 782
    },
    {
      "epoch": 12.234375,
      "grad_norm": 2.6383755207061768,
      "learning_rate": 0.000151125,
      "loss": 1.4909,
      "step": 783
    },
    {
      "epoch": 12.25,
      "grad_norm": 2.3646392822265625,
      "learning_rate": 0.00015106250000000002,
      "loss": 1.806,
      "step": 784
    },
    {
      "epoch": 12.265625,
      "grad_norm": 2.186878204345703,
      "learning_rate": 0.000151,
      "loss": 1.6702,
      "step": 785
    },
    {
      "epoch": 12.28125,
      "grad_norm": 2.3069112300872803,
      "learning_rate": 0.0001509375,
      "loss": 1.6629,
      "step": 786
    },
    {
      "epoch": 12.296875,
      "grad_norm": 2.39296555519104,
      "learning_rate": 0.00015087500000000002,
      "loss": 1.6937,
      "step": 787
    },
    {
      "epoch": 12.3125,
      "grad_norm": 2.1380372047424316,
      "learning_rate": 0.0001508125,
      "loss": 1.7837,
      "step": 788
    },
    {
      "epoch": 12.328125,
      "grad_norm": 1.9749077558517456,
      "learning_rate": 0.00015075,
      "loss": 1.9821,
      "step": 789
    },
    {
      "epoch": 12.34375,
      "grad_norm": 1.9467655420303345,
      "learning_rate": 0.0001506875,
      "loss": 1.5947,
      "step": 790
    },
    {
      "epoch": 12.359375,
      "grad_norm": 2.550492286682129,
      "learning_rate": 0.00015062500000000002,
      "loss": 1.9328,
      "step": 791
    },
    {
      "epoch": 12.375,
      "grad_norm": 2.418513774871826,
      "learning_rate": 0.0001505625,
      "loss": 1.7648,
      "step": 792
    },
    {
      "epoch": 12.390625,
      "grad_norm": 2.3875229358673096,
      "learning_rate": 0.0001505,
      "loss": 1.7567,
      "step": 793
    },
    {
      "epoch": 12.40625,
      "grad_norm": 2.139333724975586,
      "learning_rate": 0.00015043750000000002,
      "loss": 1.981,
      "step": 794
    },
    {
      "epoch": 12.421875,
      "grad_norm": 2.304769515991211,
      "learning_rate": 0.000150375,
      "loss": 1.5419,
      "step": 795
    },
    {
      "epoch": 12.4375,
      "grad_norm": 2.3143787384033203,
      "learning_rate": 0.0001503125,
      "loss": 2.0096,
      "step": 796
    },
    {
      "epoch": 12.453125,
      "grad_norm": 2.0932581424713135,
      "learning_rate": 0.00015025,
      "loss": 1.6989,
      "step": 797
    },
    {
      "epoch": 12.46875,
      "grad_norm": 2.3125252723693848,
      "learning_rate": 0.0001501875,
      "loss": 1.7414,
      "step": 798
    },
    {
      "epoch": 12.484375,
      "grad_norm": 2.2531585693359375,
      "learning_rate": 0.000150125,
      "loss": 1.8905,
      "step": 799
    },
    {
      "epoch": 12.5,
      "grad_norm": 2.462465286254883,
      "learning_rate": 0.00015006250000000002,
      "loss": 1.5972,
      "step": 800
    },
    {
      "epoch": 12.515625,
      "grad_norm": 2.3050501346588135,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.6143,
      "step": 801
    },
    {
      "epoch": 12.53125,
      "grad_norm": 2.3778867721557617,
      "learning_rate": 0.0001499375,
      "loss": 1.4376,
      "step": 802
    },
    {
      "epoch": 12.546875,
      "grad_norm": 2.325366258621216,
      "learning_rate": 0.000149875,
      "loss": 1.7356,
      "step": 803
    },
    {
      "epoch": 12.5625,
      "grad_norm": 1.908149242401123,
      "learning_rate": 0.0001498125,
      "loss": 1.5784,
      "step": 804
    },
    {
      "epoch": 12.578125,
      "grad_norm": 2.239684820175171,
      "learning_rate": 0.00014975,
      "loss": 1.7846,
      "step": 805
    },
    {
      "epoch": 12.59375,
      "grad_norm": 2.1662378311157227,
      "learning_rate": 0.0001496875,
      "loss": 1.6768,
      "step": 806
    },
    {
      "epoch": 12.609375,
      "grad_norm": 1.9423195123672485,
      "learning_rate": 0.00014962500000000002,
      "loss": 1.6428,
      "step": 807
    },
    {
      "epoch": 12.625,
      "grad_norm": 2.4466397762298584,
      "learning_rate": 0.0001495625,
      "loss": 1.7174,
      "step": 808
    },
    {
      "epoch": 12.640625,
      "grad_norm": 2.308647394180298,
      "learning_rate": 0.00014950000000000003,
      "loss": 1.4999,
      "step": 809
    },
    {
      "epoch": 12.65625,
      "grad_norm": 2.320967674255371,
      "learning_rate": 0.00014943750000000002,
      "loss": 1.7758,
      "step": 810
    },
    {
      "epoch": 12.671875,
      "grad_norm": 2.4182605743408203,
      "learning_rate": 0.00014937499999999999,
      "loss": 1.5329,
      "step": 811
    },
    {
      "epoch": 12.6875,
      "grad_norm": 2.222609043121338,
      "learning_rate": 0.0001493125,
      "loss": 1.5643,
      "step": 812
    },
    {
      "epoch": 12.703125,
      "grad_norm": 2.31217885017395,
      "learning_rate": 0.00014925,
      "loss": 1.4123,
      "step": 813
    },
    {
      "epoch": 12.71875,
      "grad_norm": 2.196685791015625,
      "learning_rate": 0.00014918750000000001,
      "loss": 1.5794,
      "step": 814
    },
    {
      "epoch": 12.734375,
      "grad_norm": 2.4597160816192627,
      "learning_rate": 0.000149125,
      "loss": 1.753,
      "step": 815
    },
    {
      "epoch": 12.75,
      "grad_norm": 2.1286306381225586,
      "learning_rate": 0.00014906250000000003,
      "loss": 1.738,
      "step": 816
    },
    {
      "epoch": 12.765625,
      "grad_norm": 2.439685106277466,
      "learning_rate": 0.00014900000000000002,
      "loss": 1.8108,
      "step": 817
    },
    {
      "epoch": 12.78125,
      "grad_norm": 2.307039737701416,
      "learning_rate": 0.0001489375,
      "loss": 1.7519,
      "step": 818
    },
    {
      "epoch": 12.796875,
      "grad_norm": 2.3184056282043457,
      "learning_rate": 0.000148875,
      "loss": 1.6765,
      "step": 819
    },
    {
      "epoch": 12.8125,
      "grad_norm": 2.474787712097168,
      "learning_rate": 0.0001488125,
      "loss": 1.7011,
      "step": 820
    },
    {
      "epoch": 12.828125,
      "grad_norm": 2.261180877685547,
      "learning_rate": 0.00014875,
      "loss": 1.8114,
      "step": 821
    },
    {
      "epoch": 12.84375,
      "grad_norm": 2.33044171333313,
      "learning_rate": 0.0001486875,
      "loss": 1.6676,
      "step": 822
    },
    {
      "epoch": 12.859375,
      "grad_norm": 2.2974300384521484,
      "learning_rate": 0.00014862500000000002,
      "loss": 1.7073,
      "step": 823
    },
    {
      "epoch": 12.875,
      "grad_norm": 2.390789031982422,
      "learning_rate": 0.0001485625,
      "loss": 1.8156,
      "step": 824
    },
    {
      "epoch": 12.890625,
      "grad_norm": 2.438309907913208,
      "learning_rate": 0.0001485,
      "loss": 1.5817,
      "step": 825
    },
    {
      "epoch": 12.90625,
      "grad_norm": 2.461362838745117,
      "learning_rate": 0.0001484375,
      "loss": 1.5546,
      "step": 826
    },
    {
      "epoch": 12.921875,
      "grad_norm": 2.021925449371338,
      "learning_rate": 0.000148375,
      "loss": 1.6127,
      "step": 827
    },
    {
      "epoch": 12.9375,
      "grad_norm": 2.6304450035095215,
      "learning_rate": 0.0001483125,
      "loss": 1.7129,
      "step": 828
    },
    {
      "epoch": 12.953125,
      "grad_norm": 2.4592854976654053,
      "learning_rate": 0.00014825,
      "loss": 1.684,
      "step": 829
    },
    {
      "epoch": 12.96875,
      "grad_norm": 2.6205832958221436,
      "learning_rate": 0.00014818750000000002,
      "loss": 1.6507,
      "step": 830
    },
    {
      "epoch": 12.984375,
      "grad_norm": 2.2272934913635254,
      "learning_rate": 0.000148125,
      "loss": 1.8444,
      "step": 831
    },
    {
      "epoch": 13.0,
      "grad_norm": 2.1665198802948,
      "learning_rate": 0.00014806250000000003,
      "loss": 1.6627,
      "step": 832
    },
    {
      "epoch": 13.0,
      "eval_loss": 2.9539947509765625,
      "eval_runtime": 2.8913,
      "eval_samples_per_second": 177.086,
      "eval_steps_per_second": 44.271,
      "step": 832
    },
    {
      "epoch": 13.015625,
      "grad_norm": 2.2163047790527344,
      "learning_rate": 0.000148,
      "loss": 1.684,
      "step": 833
    },
    {
      "epoch": 13.03125,
      "grad_norm": 2.176849603652954,
      "learning_rate": 0.0001479375,
      "loss": 1.7957,
      "step": 834
    },
    {
      "epoch": 13.046875,
      "grad_norm": 2.0644352436065674,
      "learning_rate": 0.000147875,
      "loss": 1.5804,
      "step": 835
    },
    {
      "epoch": 13.0625,
      "grad_norm": 2.1073601245880127,
      "learning_rate": 0.0001478125,
      "loss": 1.5945,
      "step": 836
    },
    {
      "epoch": 13.078125,
      "grad_norm": 2.3463070392608643,
      "learning_rate": 0.00014775,
      "loss": 1.6302,
      "step": 837
    },
    {
      "epoch": 13.09375,
      "grad_norm": 2.345533609390259,
      "learning_rate": 0.0001476875,
      "loss": 1.5602,
      "step": 838
    },
    {
      "epoch": 13.109375,
      "grad_norm": 2.2397751808166504,
      "learning_rate": 0.00014762500000000002,
      "loss": 1.6736,
      "step": 839
    },
    {
      "epoch": 13.125,
      "grad_norm": 2.158754587173462,
      "learning_rate": 0.00014756250000000002,
      "loss": 1.4848,
      "step": 840
    },
    {
      "epoch": 13.140625,
      "grad_norm": 2.405318260192871,
      "learning_rate": 0.0001475,
      "loss": 1.773,
      "step": 841
    },
    {
      "epoch": 13.15625,
      "grad_norm": 2.150970697402954,
      "learning_rate": 0.0001474375,
      "loss": 1.6848,
      "step": 842
    },
    {
      "epoch": 13.171875,
      "grad_norm": 2.3447697162628174,
      "learning_rate": 0.000147375,
      "loss": 1.3988,
      "step": 843
    },
    {
      "epoch": 13.1875,
      "grad_norm": 2.402231216430664,
      "learning_rate": 0.0001473125,
      "loss": 1.5852,
      "step": 844
    },
    {
      "epoch": 13.203125,
      "grad_norm": 2.2670698165893555,
      "learning_rate": 0.00014725,
      "loss": 1.7982,
      "step": 845
    },
    {
      "epoch": 13.21875,
      "grad_norm": 2.422499656677246,
      "learning_rate": 0.00014718750000000002,
      "loss": 1.8139,
      "step": 846
    },
    {
      "epoch": 13.234375,
      "grad_norm": 2.1656076908111572,
      "learning_rate": 0.000147125,
      "loss": 1.5975,
      "step": 847
    },
    {
      "epoch": 13.25,
      "grad_norm": 2.113365888595581,
      "learning_rate": 0.0001470625,
      "loss": 1.5301,
      "step": 848
    },
    {
      "epoch": 13.265625,
      "grad_norm": 2.355135679244995,
      "learning_rate": 0.000147,
      "loss": 1.7725,
      "step": 849
    },
    {
      "epoch": 13.28125,
      "grad_norm": 2.3939335346221924,
      "learning_rate": 0.00014693750000000001,
      "loss": 1.8969,
      "step": 850
    },
    {
      "epoch": 13.296875,
      "grad_norm": 2.4114556312561035,
      "learning_rate": 0.000146875,
      "loss": 1.8699,
      "step": 851
    },
    {
      "epoch": 13.3125,
      "grad_norm": 2.335155725479126,
      "learning_rate": 0.0001468125,
      "loss": 1.5963,
      "step": 852
    },
    {
      "epoch": 13.328125,
      "grad_norm": 2.5055391788482666,
      "learning_rate": 0.00014675000000000002,
      "loss": 1.5611,
      "step": 853
    },
    {
      "epoch": 13.34375,
      "grad_norm": 2.273737907409668,
      "learning_rate": 0.0001466875,
      "loss": 1.5253,
      "step": 854
    },
    {
      "epoch": 13.359375,
      "grad_norm": 2.604353189468384,
      "learning_rate": 0.000146625,
      "loss": 1.6831,
      "step": 855
    },
    {
      "epoch": 13.375,
      "grad_norm": 2.2345447540283203,
      "learning_rate": 0.0001465625,
      "loss": 1.4297,
      "step": 856
    },
    {
      "epoch": 13.390625,
      "grad_norm": 2.400346040725708,
      "learning_rate": 0.0001465,
      "loss": 1.5072,
      "step": 857
    },
    {
      "epoch": 13.40625,
      "grad_norm": 2.2563118934631348,
      "learning_rate": 0.0001464375,
      "loss": 1.7134,
      "step": 858
    },
    {
      "epoch": 13.421875,
      "grad_norm": 2.046412706375122,
      "learning_rate": 0.00014637500000000002,
      "loss": 1.769,
      "step": 859
    },
    {
      "epoch": 13.4375,
      "grad_norm": 2.3495256900787354,
      "learning_rate": 0.0001463125,
      "loss": 1.5213,
      "step": 860
    },
    {
      "epoch": 13.453125,
      "grad_norm": 2.5876684188842773,
      "learning_rate": 0.00014625,
      "loss": 1.8633,
      "step": 861
    },
    {
      "epoch": 13.46875,
      "grad_norm": 2.1621179580688477,
      "learning_rate": 0.00014618750000000002,
      "loss": 1.7374,
      "step": 862
    },
    {
      "epoch": 13.484375,
      "grad_norm": 2.257976770401001,
      "learning_rate": 0.000146125,
      "loss": 1.8461,
      "step": 863
    },
    {
      "epoch": 13.5,
      "grad_norm": 2.4573545455932617,
      "learning_rate": 0.0001460625,
      "loss": 1.7806,
      "step": 864
    },
    {
      "epoch": 13.515625,
      "grad_norm": 2.472666025161743,
      "learning_rate": 0.000146,
      "loss": 1.9347,
      "step": 865
    },
    {
      "epoch": 13.53125,
      "grad_norm": 2.272412061691284,
      "learning_rate": 0.00014593750000000002,
      "loss": 1.5674,
      "step": 866
    },
    {
      "epoch": 13.546875,
      "grad_norm": 2.142456293106079,
      "learning_rate": 0.000145875,
      "loss": 1.3235,
      "step": 867
    },
    {
      "epoch": 13.5625,
      "grad_norm": 2.727940320968628,
      "learning_rate": 0.0001458125,
      "loss": 1.7492,
      "step": 868
    },
    {
      "epoch": 13.578125,
      "grad_norm": 2.230180025100708,
      "learning_rate": 0.00014575000000000002,
      "loss": 1.6728,
      "step": 869
    },
    {
      "epoch": 13.59375,
      "grad_norm": 2.354210376739502,
      "learning_rate": 0.0001456875,
      "loss": 1.77,
      "step": 870
    },
    {
      "epoch": 13.609375,
      "grad_norm": 2.3010544776916504,
      "learning_rate": 0.000145625,
      "loss": 1.4937,
      "step": 871
    },
    {
      "epoch": 13.625,
      "grad_norm": 2.3652899265289307,
      "learning_rate": 0.0001455625,
      "loss": 1.6196,
      "step": 872
    },
    {
      "epoch": 13.640625,
      "grad_norm": 2.720435380935669,
      "learning_rate": 0.0001455,
      "loss": 1.4526,
      "step": 873
    },
    {
      "epoch": 13.65625,
      "grad_norm": 2.475680112838745,
      "learning_rate": 0.0001454375,
      "loss": 1.7873,
      "step": 874
    },
    {
      "epoch": 13.671875,
      "grad_norm": 2.2365708351135254,
      "learning_rate": 0.00014537500000000002,
      "loss": 1.8028,
      "step": 875
    },
    {
      "epoch": 13.6875,
      "grad_norm": 2.641083240509033,
      "learning_rate": 0.00014531250000000002,
      "loss": 1.6711,
      "step": 876
    },
    {
      "epoch": 13.703125,
      "grad_norm": 2.278646945953369,
      "learning_rate": 0.00014525,
      "loss": 1.5848,
      "step": 877
    },
    {
      "epoch": 13.71875,
      "grad_norm": 2.2853996753692627,
      "learning_rate": 0.0001451875,
      "loss": 1.6043,
      "step": 878
    },
    {
      "epoch": 13.734375,
      "grad_norm": 2.1722795963287354,
      "learning_rate": 0.000145125,
      "loss": 1.6159,
      "step": 879
    },
    {
      "epoch": 13.75,
      "grad_norm": 2.383362054824829,
      "learning_rate": 0.0001450625,
      "loss": 1.6921,
      "step": 880
    },
    {
      "epoch": 13.765625,
      "grad_norm": 2.4194018840789795,
      "learning_rate": 0.000145,
      "loss": 1.6402,
      "step": 881
    },
    {
      "epoch": 13.78125,
      "grad_norm": 2.1498444080352783,
      "learning_rate": 0.00014493750000000002,
      "loss": 1.7647,
      "step": 882
    },
    {
      "epoch": 13.796875,
      "grad_norm": 2.1050143241882324,
      "learning_rate": 0.000144875,
      "loss": 1.5873,
      "step": 883
    },
    {
      "epoch": 13.8125,
      "grad_norm": 2.5488946437835693,
      "learning_rate": 0.00014481250000000003,
      "loss": 1.7966,
      "step": 884
    },
    {
      "epoch": 13.828125,
      "grad_norm": 2.5401437282562256,
      "learning_rate": 0.00014475,
      "loss": 1.7784,
      "step": 885
    },
    {
      "epoch": 13.84375,
      "grad_norm": 2.3881566524505615,
      "learning_rate": 0.0001446875,
      "loss": 1.7112,
      "step": 886
    },
    {
      "epoch": 13.859375,
      "grad_norm": 2.210606336593628,
      "learning_rate": 0.000144625,
      "loss": 1.7077,
      "step": 887
    },
    {
      "epoch": 13.875,
      "grad_norm": 2.2548110485076904,
      "learning_rate": 0.0001445625,
      "loss": 1.7401,
      "step": 888
    },
    {
      "epoch": 13.890625,
      "grad_norm": 2.4882664680480957,
      "learning_rate": 0.00014450000000000002,
      "loss": 1.7652,
      "step": 889
    },
    {
      "epoch": 13.90625,
      "grad_norm": 2.2545735836029053,
      "learning_rate": 0.0001444375,
      "loss": 1.7183,
      "step": 890
    },
    {
      "epoch": 13.921875,
      "grad_norm": 2.1958916187286377,
      "learning_rate": 0.00014437500000000003,
      "loss": 1.7518,
      "step": 891
    },
    {
      "epoch": 13.9375,
      "grad_norm": 2.353698968887329,
      "learning_rate": 0.00014431250000000002,
      "loss": 1.8408,
      "step": 892
    },
    {
      "epoch": 13.953125,
      "grad_norm": 2.4690675735473633,
      "learning_rate": 0.00014425,
      "loss": 2.1124,
      "step": 893
    },
    {
      "epoch": 13.96875,
      "grad_norm": 2.2375316619873047,
      "learning_rate": 0.0001441875,
      "loss": 1.7176,
      "step": 894
    },
    {
      "epoch": 13.984375,
      "grad_norm": 2.989802122116089,
      "learning_rate": 0.000144125,
      "loss": 1.7305,
      "step": 895
    },
    {
      "epoch": 14.0,
      "grad_norm": 2.443420171737671,
      "learning_rate": 0.0001440625,
      "loss": 1.56,
      "step": 896
    },
    {
      "epoch": 14.0,
      "eval_loss": 2.965451717376709,
      "eval_runtime": 2.8971,
      "eval_samples_per_second": 176.73,
      "eval_steps_per_second": 44.182,
      "step": 896
    },
    {
      "epoch": 14.015625,
      "grad_norm": 2.5092055797576904,
      "learning_rate": 0.000144,
      "loss": 1.4683,
      "step": 897
    },
    {
      "epoch": 14.03125,
      "grad_norm": 2.190365791320801,
      "learning_rate": 0.00014393750000000002,
      "loss": 1.7887,
      "step": 898
    },
    {
      "epoch": 14.046875,
      "grad_norm": 2.1534461975097656,
      "learning_rate": 0.00014387500000000001,
      "loss": 1.4994,
      "step": 899
    },
    {
      "epoch": 14.0625,
      "grad_norm": 2.4148619174957275,
      "learning_rate": 0.0001438125,
      "loss": 1.4983,
      "step": 900
    },
    {
      "epoch": 14.078125,
      "grad_norm": 2.0283493995666504,
      "learning_rate": 0.00014375,
      "loss": 1.9023,
      "step": 901
    },
    {
      "epoch": 14.09375,
      "grad_norm": 2.3432018756866455,
      "learning_rate": 0.0001436875,
      "loss": 1.7842,
      "step": 902
    },
    {
      "epoch": 14.109375,
      "grad_norm": 2.023453712463379,
      "learning_rate": 0.000143625,
      "loss": 1.7151,
      "step": 903
    },
    {
      "epoch": 14.125,
      "grad_norm": 2.4511466026306152,
      "learning_rate": 0.0001435625,
      "loss": 1.809,
      "step": 904
    },
    {
      "epoch": 14.140625,
      "grad_norm": 2.359386920928955,
      "learning_rate": 0.00014350000000000002,
      "loss": 1.7338,
      "step": 905
    },
    {
      "epoch": 14.15625,
      "grad_norm": 2.153615713119507,
      "learning_rate": 0.0001434375,
      "loss": 1.401,
      "step": 906
    },
    {
      "epoch": 14.171875,
      "grad_norm": 2.1073477268218994,
      "learning_rate": 0.000143375,
      "loss": 1.6182,
      "step": 907
    },
    {
      "epoch": 14.1875,
      "grad_norm": 2.159209728240967,
      "learning_rate": 0.0001433125,
      "loss": 1.5592,
      "step": 908
    },
    {
      "epoch": 14.203125,
      "grad_norm": 2.325587272644043,
      "learning_rate": 0.00014325,
      "loss": 1.8959,
      "step": 909
    },
    {
      "epoch": 14.21875,
      "grad_norm": 2.3604912757873535,
      "learning_rate": 0.0001431875,
      "loss": 1.8426,
      "step": 910
    },
    {
      "epoch": 14.234375,
      "grad_norm": 2.434843063354492,
      "learning_rate": 0.000143125,
      "loss": 1.693,
      "step": 911
    },
    {
      "epoch": 14.25,
      "grad_norm": 2.0467562675476074,
      "learning_rate": 0.00014306250000000001,
      "loss": 1.5186,
      "step": 912
    },
    {
      "epoch": 14.265625,
      "grad_norm": 2.4285778999328613,
      "learning_rate": 0.000143,
      "loss": 1.8027,
      "step": 913
    },
    {
      "epoch": 14.28125,
      "grad_norm": 2.2321033477783203,
      "learning_rate": 0.00014293750000000003,
      "loss": 1.5331,
      "step": 914
    },
    {
      "epoch": 14.296875,
      "grad_norm": 2.5176374912261963,
      "learning_rate": 0.000142875,
      "loss": 1.5294,
      "step": 915
    },
    {
      "epoch": 14.3125,
      "grad_norm": 2.3702774047851562,
      "learning_rate": 0.0001428125,
      "loss": 1.9311,
      "step": 916
    },
    {
      "epoch": 14.328125,
      "grad_norm": 2.493407964706421,
      "learning_rate": 0.00014275,
      "loss": 1.7014,
      "step": 917
    },
    {
      "epoch": 14.34375,
      "grad_norm": 2.5053606033325195,
      "learning_rate": 0.0001426875,
      "loss": 1.8236,
      "step": 918
    },
    {
      "epoch": 14.359375,
      "grad_norm": 2.279064416885376,
      "learning_rate": 0.000142625,
      "loss": 1.7029,
      "step": 919
    },
    {
      "epoch": 14.375,
      "grad_norm": 2.112210750579834,
      "learning_rate": 0.0001425625,
      "loss": 1.6311,
      "step": 920
    },
    {
      "epoch": 14.390625,
      "grad_norm": 2.4912266731262207,
      "learning_rate": 0.00014250000000000002,
      "loss": 1.5384,
      "step": 921
    },
    {
      "epoch": 14.40625,
      "grad_norm": 2.2691032886505127,
      "learning_rate": 0.0001424375,
      "loss": 1.5634,
      "step": 922
    },
    {
      "epoch": 14.421875,
      "grad_norm": 2.1717724800109863,
      "learning_rate": 0.000142375,
      "loss": 1.5163,
      "step": 923
    },
    {
      "epoch": 14.4375,
      "grad_norm": 2.511040449142456,
      "learning_rate": 0.0001423125,
      "loss": 1.7157,
      "step": 924
    },
    {
      "epoch": 14.453125,
      "grad_norm": 2.415785789489746,
      "learning_rate": 0.00014225000000000002,
      "loss": 1.7166,
      "step": 925
    },
    {
      "epoch": 14.46875,
      "grad_norm": 2.5659291744232178,
      "learning_rate": 0.0001421875,
      "loss": 1.6621,
      "step": 926
    },
    {
      "epoch": 14.484375,
      "grad_norm": 1.9743083715438843,
      "learning_rate": 0.000142125,
      "loss": 1.5146,
      "step": 927
    },
    {
      "epoch": 14.5,
      "grad_norm": 2.6673779487609863,
      "learning_rate": 0.00014206250000000002,
      "loss": 1.5015,
      "step": 928
    },
    {
      "epoch": 14.515625,
      "grad_norm": 2.3460447788238525,
      "learning_rate": 0.000142,
      "loss": 1.663,
      "step": 929
    },
    {
      "epoch": 14.53125,
      "grad_norm": 2.474961280822754,
      "learning_rate": 0.0001419375,
      "loss": 1.8449,
      "step": 930
    },
    {
      "epoch": 14.546875,
      "grad_norm": 2.5219473838806152,
      "learning_rate": 0.000141875,
      "loss": 1.6719,
      "step": 931
    },
    {
      "epoch": 14.5625,
      "grad_norm": 2.3573157787323,
      "learning_rate": 0.0001418125,
      "loss": 1.5151,
      "step": 932
    },
    {
      "epoch": 14.578125,
      "grad_norm": 2.7143466472625732,
      "learning_rate": 0.00014175,
      "loss": 1.5871,
      "step": 933
    },
    {
      "epoch": 14.59375,
      "grad_norm": 2.6236276626586914,
      "learning_rate": 0.00014168750000000002,
      "loss": 1.4639,
      "step": 934
    },
    {
      "epoch": 14.609375,
      "grad_norm": 2.4395551681518555,
      "learning_rate": 0.00014162500000000001,
      "loss": 1.5129,
      "step": 935
    },
    {
      "epoch": 14.625,
      "grad_norm": 2.4406802654266357,
      "learning_rate": 0.0001415625,
      "loss": 1.6651,
      "step": 936
    },
    {
      "epoch": 14.640625,
      "grad_norm": 2.175191879272461,
      "learning_rate": 0.0001415,
      "loss": 1.6725,
      "step": 937
    },
    {
      "epoch": 14.65625,
      "grad_norm": 2.5274364948272705,
      "learning_rate": 0.0001414375,
      "loss": 1.4572,
      "step": 938
    },
    {
      "epoch": 14.671875,
      "grad_norm": 2.4648993015289307,
      "learning_rate": 0.000141375,
      "loss": 1.6842,
      "step": 939
    },
    {
      "epoch": 14.6875,
      "grad_norm": 2.554225206375122,
      "learning_rate": 0.0001413125,
      "loss": 1.7334,
      "step": 940
    },
    {
      "epoch": 14.703125,
      "grad_norm": 2.255279302597046,
      "learning_rate": 0.00014125000000000002,
      "loss": 1.8204,
      "step": 941
    },
    {
      "epoch": 14.71875,
      "grad_norm": 2.0615689754486084,
      "learning_rate": 0.0001411875,
      "loss": 1.858,
      "step": 942
    },
    {
      "epoch": 14.734375,
      "grad_norm": 2.1186935901641846,
      "learning_rate": 0.000141125,
      "loss": 1.9727,
      "step": 943
    },
    {
      "epoch": 14.75,
      "grad_norm": 2.6326489448547363,
      "learning_rate": 0.00014106250000000002,
      "loss": 1.4377,
      "step": 944
    },
    {
      "epoch": 14.765625,
      "grad_norm": 2.309861660003662,
      "learning_rate": 0.000141,
      "loss": 1.5305,
      "step": 945
    },
    {
      "epoch": 14.78125,
      "grad_norm": 2.8338942527770996,
      "learning_rate": 0.0001409375,
      "loss": 1.723,
      "step": 946
    },
    {
      "epoch": 14.796875,
      "grad_norm": 2.162248134613037,
      "learning_rate": 0.000140875,
      "loss": 1.5943,
      "step": 947
    },
    {
      "epoch": 14.8125,
      "grad_norm": 2.5996196269989014,
      "learning_rate": 0.00014081250000000001,
      "loss": 1.6674,
      "step": 948
    },
    {
      "epoch": 14.828125,
      "grad_norm": 2.5607497692108154,
      "learning_rate": 0.00014075,
      "loss": 1.6514,
      "step": 949
    },
    {
      "epoch": 14.84375,
      "grad_norm": 103.29804229736328,
      "learning_rate": 0.00014068750000000002,
      "loss": 1.4626,
      "step": 950
    },
    {
      "epoch": 14.859375,
      "grad_norm": 2.1845431327819824,
      "learning_rate": 0.00014062500000000002,
      "loss": 1.5425,
      "step": 951
    },
    {
      "epoch": 14.875,
      "grad_norm": 2.547502040863037,
      "learning_rate": 0.0001405625,
      "loss": 1.4619,
      "step": 952
    },
    {
      "epoch": 14.890625,
      "grad_norm": 2.825326442718506,
      "learning_rate": 0.0001405,
      "loss": 1.6599,
      "step": 953
    },
    {
      "epoch": 14.90625,
      "grad_norm": 2.4049885272979736,
      "learning_rate": 0.0001404375,
      "loss": 1.736,
      "step": 954
    },
    {
      "epoch": 14.921875,
      "grad_norm": 2.198308229446411,
      "learning_rate": 0.000140375,
      "loss": 1.5035,
      "step": 955
    },
    {
      "epoch": 14.9375,
      "grad_norm": 2.221790313720703,
      "learning_rate": 0.0001403125,
      "loss": 1.6978,
      "step": 956
    },
    {
      "epoch": 14.953125,
      "grad_norm": 2.249948501586914,
      "learning_rate": 0.00014025000000000002,
      "loss": 1.5607,
      "step": 957
    },
    {
      "epoch": 14.96875,
      "grad_norm": 2.181615114212036,
      "learning_rate": 0.0001401875,
      "loss": 1.8106,
      "step": 958
    },
    {
      "epoch": 14.984375,
      "grad_norm": 2.2967913150787354,
      "learning_rate": 0.000140125,
      "loss": 1.5298,
      "step": 959
    },
    {
      "epoch": 15.0,
      "grad_norm": 2.090559959411621,
      "learning_rate": 0.0001400625,
      "loss": 1.2253,
      "step": 960
    },
    {
      "epoch": 15.0,
      "eval_loss": 2.9748682975769043,
      "eval_runtime": 2.8539,
      "eval_samples_per_second": 179.401,
      "eval_steps_per_second": 44.85,
      "step": 960
    },
    {
      "epoch": 15.015625,
      "grad_norm": 2.1657869815826416,
      "learning_rate": 0.00014,
      "loss": 1.5102,
      "step": 961
    },
    {
      "epoch": 15.03125,
      "grad_norm": 2.0783071517944336,
      "learning_rate": 0.0001399375,
      "loss": 1.4082,
      "step": 962
    },
    {
      "epoch": 15.046875,
      "grad_norm": 2.4565839767456055,
      "learning_rate": 0.000139875,
      "loss": 1.4786,
      "step": 963
    },
    {
      "epoch": 15.0625,
      "grad_norm": 2.161759376525879,
      "learning_rate": 0.00013981250000000002,
      "loss": 1.6642,
      "step": 964
    },
    {
      "epoch": 15.078125,
      "grad_norm": 2.144174814224243,
      "learning_rate": 0.00013975,
      "loss": 1.6442,
      "step": 965
    },
    {
      "epoch": 15.09375,
      "grad_norm": 2.073519229888916,
      "learning_rate": 0.00013968750000000003,
      "loss": 1.8856,
      "step": 966
    },
    {
      "epoch": 15.109375,
      "grad_norm": 2.168311595916748,
      "learning_rate": 0.00013962500000000002,
      "loss": 1.75,
      "step": 967
    },
    {
      "epoch": 15.125,
      "grad_norm": 2.286858320236206,
      "learning_rate": 0.00013956249999999998,
      "loss": 1.6699,
      "step": 968
    },
    {
      "epoch": 15.140625,
      "grad_norm": 2.406161069869995,
      "learning_rate": 0.0001395,
      "loss": 1.3362,
      "step": 969
    },
    {
      "epoch": 15.15625,
      "grad_norm": 2.522167921066284,
      "learning_rate": 0.0001394375,
      "loss": 1.5163,
      "step": 970
    },
    {
      "epoch": 15.171875,
      "grad_norm": 2.2905728816986084,
      "learning_rate": 0.000139375,
      "loss": 1.4861,
      "step": 971
    },
    {
      "epoch": 15.1875,
      "grad_norm": 2.4107401371002197,
      "learning_rate": 0.0001393125,
      "loss": 1.8821,
      "step": 972
    },
    {
      "epoch": 15.203125,
      "grad_norm": 2.3180198669433594,
      "learning_rate": 0.00013925000000000002,
      "loss": 1.5459,
      "step": 973
    },
    {
      "epoch": 15.21875,
      "grad_norm": 2.3226561546325684,
      "learning_rate": 0.00013918750000000002,
      "loss": 1.6264,
      "step": 974
    },
    {
      "epoch": 15.234375,
      "grad_norm": 2.4935762882232666,
      "learning_rate": 0.000139125,
      "loss": 1.8213,
      "step": 975
    },
    {
      "epoch": 15.25,
      "grad_norm": 2.378350019454956,
      "learning_rate": 0.0001390625,
      "loss": 1.8378,
      "step": 976
    },
    {
      "epoch": 15.265625,
      "grad_norm": 2.297888994216919,
      "learning_rate": 0.000139,
      "loss": 1.7162,
      "step": 977
    },
    {
      "epoch": 15.28125,
      "grad_norm": 2.3169381618499756,
      "learning_rate": 0.0001389375,
      "loss": 1.6625,
      "step": 978
    },
    {
      "epoch": 15.296875,
      "grad_norm": 2.0399932861328125,
      "learning_rate": 0.000138875,
      "loss": 1.5948,
      "step": 979
    },
    {
      "epoch": 15.3125,
      "grad_norm": 2.3443424701690674,
      "learning_rate": 0.00013881250000000002,
      "loss": 1.6741,
      "step": 980
    },
    {
      "epoch": 15.328125,
      "grad_norm": 2.4872660636901855,
      "learning_rate": 0.00013875,
      "loss": 1.7431,
      "step": 981
    },
    {
      "epoch": 15.34375,
      "grad_norm": 2.023224115371704,
      "learning_rate": 0.0001386875,
      "loss": 1.6897,
      "step": 982
    },
    {
      "epoch": 15.359375,
      "grad_norm": 2.121760606765747,
      "learning_rate": 0.000138625,
      "loss": 1.7795,
      "step": 983
    },
    {
      "epoch": 15.375,
      "grad_norm": 2.353027582168579,
      "learning_rate": 0.00013856250000000001,
      "loss": 1.349,
      "step": 984
    },
    {
      "epoch": 15.390625,
      "grad_norm": 2.0666913986206055,
      "learning_rate": 0.0001385,
      "loss": 1.4151,
      "step": 985
    },
    {
      "epoch": 15.40625,
      "grad_norm": 2.267416477203369,
      "learning_rate": 0.0001384375,
      "loss": 1.4877,
      "step": 986
    },
    {
      "epoch": 15.421875,
      "grad_norm": 2.2818191051483154,
      "learning_rate": 0.00013837500000000002,
      "loss": 1.5288,
      "step": 987
    },
    {
      "epoch": 15.4375,
      "grad_norm": 2.261308431625366,
      "learning_rate": 0.0001383125,
      "loss": 1.6105,
      "step": 988
    },
    {
      "epoch": 15.453125,
      "grad_norm": 2.4918394088745117,
      "learning_rate": 0.00013825,
      "loss": 1.5466,
      "step": 989
    },
    {
      "epoch": 15.46875,
      "grad_norm": 2.6580026149749756,
      "learning_rate": 0.0001381875,
      "loss": 1.61,
      "step": 990
    },
    {
      "epoch": 15.484375,
      "grad_norm": 2.3631784915924072,
      "learning_rate": 0.000138125,
      "loss": 1.587,
      "step": 991
    },
    {
      "epoch": 15.5,
      "grad_norm": 2.2214295864105225,
      "learning_rate": 0.0001380625,
      "loss": 1.812,
      "step": 992
    },
    {
      "epoch": 15.515625,
      "grad_norm": 2.7446587085723877,
      "learning_rate": 0.000138,
      "loss": 1.7022,
      "step": 993
    },
    {
      "epoch": 15.53125,
      "grad_norm": 2.0972964763641357,
      "learning_rate": 0.0001379375,
      "loss": 1.7234,
      "step": 994
    },
    {
      "epoch": 15.546875,
      "grad_norm": 2.229729175567627,
      "learning_rate": 0.000137875,
      "loss": 1.6717,
      "step": 995
    },
    {
      "epoch": 15.5625,
      "grad_norm": 2.2229976654052734,
      "learning_rate": 0.00013781250000000002,
      "loss": 1.7567,
      "step": 996
    },
    {
      "epoch": 15.578125,
      "grad_norm": 2.279646873474121,
      "learning_rate": 0.00013775000000000001,
      "loss": 1.4595,
      "step": 997
    },
    {
      "epoch": 15.59375,
      "grad_norm": 2.2558557987213135,
      "learning_rate": 0.0001376875,
      "loss": 1.7511,
      "step": 998
    },
    {
      "epoch": 15.609375,
      "grad_norm": 2.4054348468780518,
      "learning_rate": 0.000137625,
      "loss": 1.7211,
      "step": 999
    },
    {
      "epoch": 15.625,
      "grad_norm": 2.6602680683135986,
      "learning_rate": 0.00013756250000000002,
      "loss": 1.7527,
      "step": 1000
    },
    {
      "epoch": 15.640625,
      "grad_norm": 2.5577571392059326,
      "learning_rate": 0.0001375,
      "loss": 1.75,
      "step": 1001
    },
    {
      "epoch": 15.65625,
      "grad_norm": 2.416569232940674,
      "learning_rate": 0.0001374375,
      "loss": 1.4958,
      "step": 1002
    },
    {
      "epoch": 15.671875,
      "grad_norm": 2.334887981414795,
      "learning_rate": 0.00013737500000000002,
      "loss": 1.692,
      "step": 1003
    },
    {
      "epoch": 15.6875,
      "grad_norm": 2.4420547485351562,
      "learning_rate": 0.0001373125,
      "loss": 1.5571,
      "step": 1004
    },
    {
      "epoch": 15.703125,
      "grad_norm": 2.435744047164917,
      "learning_rate": 0.00013725,
      "loss": 1.3606,
      "step": 1005
    },
    {
      "epoch": 15.71875,
      "grad_norm": 2.4233322143554688,
      "learning_rate": 0.0001371875,
      "loss": 1.521,
      "step": 1006
    },
    {
      "epoch": 15.734375,
      "grad_norm": 2.4812967777252197,
      "learning_rate": 0.000137125,
      "loss": 1.4564,
      "step": 1007
    },
    {
      "epoch": 15.75,
      "grad_norm": 2.4611287117004395,
      "learning_rate": 0.0001370625,
      "loss": 1.582,
      "step": 1008
    },
    {
      "epoch": 15.765625,
      "grad_norm": 2.2939231395721436,
      "learning_rate": 0.00013700000000000002,
      "loss": 1.5069,
      "step": 1009
    },
    {
      "epoch": 15.78125,
      "grad_norm": 2.345013380050659,
      "learning_rate": 0.00013693750000000001,
      "loss": 1.6769,
      "step": 1010
    },
    {
      "epoch": 15.796875,
      "grad_norm": 2.2392804622650146,
      "learning_rate": 0.000136875,
      "loss": 1.5209,
      "step": 1011
    },
    {
      "epoch": 15.8125,
      "grad_norm": 2.375722885131836,
      "learning_rate": 0.0001368125,
      "loss": 1.5926,
      "step": 1012
    },
    {
      "epoch": 15.828125,
      "grad_norm": 2.142960548400879,
      "learning_rate": 0.00013675,
      "loss": 1.6471,
      "step": 1013
    },
    {
      "epoch": 15.84375,
      "grad_norm": 2.4333248138427734,
      "learning_rate": 0.0001366875,
      "loss": 1.6622,
      "step": 1014
    },
    {
      "epoch": 15.859375,
      "grad_norm": 2.5236752033233643,
      "learning_rate": 0.000136625,
      "loss": 1.6081,
      "step": 1015
    },
    {
      "epoch": 15.875,
      "grad_norm": 2.213369131088257,
      "learning_rate": 0.00013656250000000002,
      "loss": 1.3854,
      "step": 1016
    },
    {
      "epoch": 15.890625,
      "grad_norm": 2.2718708515167236,
      "learning_rate": 0.0001365,
      "loss": 1.7314,
      "step": 1017
    },
    {
      "epoch": 15.90625,
      "grad_norm": 2.3474979400634766,
      "learning_rate": 0.0001364375,
      "loss": 1.398,
      "step": 1018
    },
    {
      "epoch": 15.921875,
      "grad_norm": 2.588716506958008,
      "learning_rate": 0.00013637500000000002,
      "loss": 1.6211,
      "step": 1019
    },
    {
      "epoch": 15.9375,
      "grad_norm": 2.3394827842712402,
      "learning_rate": 0.00013631249999999999,
      "loss": 1.7644,
      "step": 1020
    },
    {
      "epoch": 15.953125,
      "grad_norm": 2.5340046882629395,
      "learning_rate": 0.00013625,
      "loss": 1.47,
      "step": 1021
    },
    {
      "epoch": 15.96875,
      "grad_norm": 2.2061123847961426,
      "learning_rate": 0.0001361875,
      "loss": 1.7944,
      "step": 1022
    },
    {
      "epoch": 15.984375,
      "grad_norm": 2.3324475288391113,
      "learning_rate": 0.00013612500000000002,
      "loss": 1.7662,
      "step": 1023
    },
    {
      "epoch": 16.0,
      "grad_norm": 3.132556676864624,
      "learning_rate": 0.0001360625,
      "loss": 1.8936,
      "step": 1024
    },
    {
      "epoch": 16.0,
      "eval_loss": 2.9830784797668457,
      "eval_runtime": 2.8558,
      "eval_samples_per_second": 179.283,
      "eval_steps_per_second": 44.821,
      "step": 1024
    },
    {
      "epoch": 16.015625,
      "grad_norm": 2.3021750450134277,
      "learning_rate": 0.00013600000000000003,
      "loss": 1.7555,
      "step": 1025
    },
    {
      "epoch": 16.03125,
      "grad_norm": 2.243462562561035,
      "learning_rate": 0.00013593750000000002,
      "loss": 1.3651,
      "step": 1026
    },
    {
      "epoch": 16.046875,
      "grad_norm": 2.005464792251587,
      "learning_rate": 0.000135875,
      "loss": 1.4422,
      "step": 1027
    },
    {
      "epoch": 16.0625,
      "grad_norm": 2.3944201469421387,
      "learning_rate": 0.0001358125,
      "loss": 1.8051,
      "step": 1028
    },
    {
      "epoch": 16.078125,
      "grad_norm": 2.312873363494873,
      "learning_rate": 0.00013575,
      "loss": 1.6024,
      "step": 1029
    },
    {
      "epoch": 16.09375,
      "grad_norm": 1.9431297779083252,
      "learning_rate": 0.0001356875,
      "loss": 1.6167,
      "step": 1030
    },
    {
      "epoch": 16.109375,
      "grad_norm": 2.211374282836914,
      "learning_rate": 0.000135625,
      "loss": 1.4715,
      "step": 1031
    },
    {
      "epoch": 16.125,
      "grad_norm": 1.8873523473739624,
      "learning_rate": 0.00013556250000000002,
      "loss": 1.644,
      "step": 1032
    },
    {
      "epoch": 16.140625,
      "grad_norm": 2.1864633560180664,
      "learning_rate": 0.00013550000000000001,
      "loss": 1.5954,
      "step": 1033
    },
    {
      "epoch": 16.15625,
      "grad_norm": 2.1880531311035156,
      "learning_rate": 0.0001354375,
      "loss": 1.5014,
      "step": 1034
    },
    {
      "epoch": 16.171875,
      "grad_norm": 2.276336431503296,
      "learning_rate": 0.000135375,
      "loss": 1.5845,
      "step": 1035
    },
    {
      "epoch": 16.1875,
      "grad_norm": 2.331857442855835,
      "learning_rate": 0.0001353125,
      "loss": 1.4072,
      "step": 1036
    },
    {
      "epoch": 16.203125,
      "grad_norm": 2.196221113204956,
      "learning_rate": 0.00013525,
      "loss": 1.4511,
      "step": 1037
    },
    {
      "epoch": 16.21875,
      "grad_norm": 2.582775831222534,
      "learning_rate": 0.0001351875,
      "loss": 1.3954,
      "step": 1038
    },
    {
      "epoch": 16.234375,
      "grad_norm": 2.2767174243927,
      "learning_rate": 0.00013512500000000002,
      "loss": 1.5797,
      "step": 1039
    },
    {
      "epoch": 16.25,
      "grad_norm": 2.697880268096924,
      "learning_rate": 0.0001350625,
      "loss": 1.6964,
      "step": 1040
    },
    {
      "epoch": 16.265625,
      "grad_norm": 2.4245786666870117,
      "learning_rate": 0.00013500000000000003,
      "loss": 1.5786,
      "step": 1041
    },
    {
      "epoch": 16.28125,
      "grad_norm": 2.4134976863861084,
      "learning_rate": 0.0001349375,
      "loss": 1.5526,
      "step": 1042
    },
    {
      "epoch": 16.296875,
      "grad_norm": 2.415506362915039,
      "learning_rate": 0.00013487499999999999,
      "loss": 1.7083,
      "step": 1043
    },
    {
      "epoch": 16.3125,
      "grad_norm": 2.7574875354766846,
      "learning_rate": 0.0001348125,
      "loss": 1.5129,
      "step": 1044
    },
    {
      "epoch": 16.328125,
      "grad_norm": 2.488355875015259,
      "learning_rate": 0.00013475,
      "loss": 1.6727,
      "step": 1045
    },
    {
      "epoch": 16.34375,
      "grad_norm": 2.3214612007141113,
      "learning_rate": 0.00013468750000000001,
      "loss": 1.5747,
      "step": 1046
    },
    {
      "epoch": 16.359375,
      "grad_norm": 2.589024066925049,
      "learning_rate": 0.000134625,
      "loss": 1.4574,
      "step": 1047
    },
    {
      "epoch": 16.375,
      "grad_norm": 2.4039664268493652,
      "learning_rate": 0.00013456250000000002,
      "loss": 1.4018,
      "step": 1048
    },
    {
      "epoch": 16.390625,
      "grad_norm": 2.2596893310546875,
      "learning_rate": 0.00013450000000000002,
      "loss": 1.6187,
      "step": 1049
    },
    {
      "epoch": 16.40625,
      "grad_norm": 2.1928601264953613,
      "learning_rate": 0.0001344375,
      "loss": 1.5739,
      "step": 1050
    },
    {
      "epoch": 16.421875,
      "grad_norm": 2.433225631713867,
      "learning_rate": 0.000134375,
      "loss": 1.4919,
      "step": 1051
    },
    {
      "epoch": 16.4375,
      "grad_norm": 2.3973569869995117,
      "learning_rate": 0.0001343125,
      "loss": 1.4218,
      "step": 1052
    },
    {
      "epoch": 16.453125,
      "grad_norm": 2.5160622596740723,
      "learning_rate": 0.00013425,
      "loss": 1.5507,
      "step": 1053
    },
    {
      "epoch": 16.46875,
      "grad_norm": 2.416083812713623,
      "learning_rate": 0.0001341875,
      "loss": 1.4882,
      "step": 1054
    },
    {
      "epoch": 16.484375,
      "grad_norm": 2.337022304534912,
      "learning_rate": 0.00013412500000000002,
      "loss": 1.6486,
      "step": 1055
    },
    {
      "epoch": 16.5,
      "grad_norm": 2.0781219005584717,
      "learning_rate": 0.0001340625,
      "loss": 1.5095,
      "step": 1056
    },
    {
      "epoch": 16.515625,
      "grad_norm": 2.63716983795166,
      "learning_rate": 0.000134,
      "loss": 1.4384,
      "step": 1057
    },
    {
      "epoch": 16.53125,
      "grad_norm": 2.3575124740600586,
      "learning_rate": 0.0001339375,
      "loss": 1.2154,
      "step": 1058
    },
    {
      "epoch": 16.546875,
      "grad_norm": 2.3231959342956543,
      "learning_rate": 0.00013387500000000002,
      "loss": 1.7211,
      "step": 1059
    },
    {
      "epoch": 16.5625,
      "grad_norm": 2.302767515182495,
      "learning_rate": 0.0001338125,
      "loss": 1.7291,
      "step": 1060
    },
    {
      "epoch": 16.578125,
      "grad_norm": 2.5566024780273438,
      "learning_rate": 0.00013375,
      "loss": 1.7933,
      "step": 1061
    },
    {
      "epoch": 16.59375,
      "grad_norm": 2.4309442043304443,
      "learning_rate": 0.00013368750000000002,
      "loss": 1.4771,
      "step": 1062
    },
    {
      "epoch": 16.609375,
      "grad_norm": 2.545501232147217,
      "learning_rate": 0.000133625,
      "loss": 1.5502,
      "step": 1063
    },
    {
      "epoch": 16.625,
      "grad_norm": 2.1699366569519043,
      "learning_rate": 0.0001335625,
      "loss": 1.7008,
      "step": 1064
    },
    {
      "epoch": 16.640625,
      "grad_norm": 1.9884780645370483,
      "learning_rate": 0.0001335,
      "loss": 1.2689,
      "step": 1065
    },
    {
      "epoch": 16.65625,
      "grad_norm": 2.4189162254333496,
      "learning_rate": 0.0001334375,
      "loss": 1.4221,
      "step": 1066
    },
    {
      "epoch": 16.671875,
      "grad_norm": 2.1498124599456787,
      "learning_rate": 0.000133375,
      "loss": 1.6256,
      "step": 1067
    },
    {
      "epoch": 16.6875,
      "grad_norm": 2.475755214691162,
      "learning_rate": 0.0001333125,
      "loss": 1.7144,
      "step": 1068
    },
    {
      "epoch": 16.703125,
      "grad_norm": 2.6625492572784424,
      "learning_rate": 0.00013325,
      "loss": 1.6612,
      "step": 1069
    },
    {
      "epoch": 16.71875,
      "grad_norm": 2.5170583724975586,
      "learning_rate": 0.0001331875,
      "loss": 1.5959,
      "step": 1070
    },
    {
      "epoch": 16.734375,
      "grad_norm": 2.486543893814087,
      "learning_rate": 0.00013312500000000002,
      "loss": 1.6417,
      "step": 1071
    },
    {
      "epoch": 16.75,
      "grad_norm": 2.1745715141296387,
      "learning_rate": 0.0001330625,
      "loss": 1.8338,
      "step": 1072
    },
    {
      "epoch": 16.765625,
      "grad_norm": 2.1931350231170654,
      "learning_rate": 0.000133,
      "loss": 1.507,
      "step": 1073
    },
    {
      "epoch": 16.78125,
      "grad_norm": 2.228127956390381,
      "learning_rate": 0.0001329375,
      "loss": 1.8705,
      "step": 1074
    },
    {
      "epoch": 16.796875,
      "grad_norm": 2.3877980709075928,
      "learning_rate": 0.00013287500000000002,
      "loss": 1.5675,
      "step": 1075
    },
    {
      "epoch": 16.8125,
      "grad_norm": 2.3446741104125977,
      "learning_rate": 0.0001328125,
      "loss": 1.3286,
      "step": 1076
    },
    {
      "epoch": 16.828125,
      "grad_norm": 2.3592019081115723,
      "learning_rate": 0.00013275,
      "loss": 1.6372,
      "step": 1077
    },
    {
      "epoch": 16.84375,
      "grad_norm": 2.2352447509765625,
      "learning_rate": 0.00013268750000000002,
      "loss": 1.6779,
      "step": 1078
    },
    {
      "epoch": 16.859375,
      "grad_norm": 2.2648143768310547,
      "learning_rate": 0.000132625,
      "loss": 1.7872,
      "step": 1079
    },
    {
      "epoch": 16.875,
      "grad_norm": 2.6004092693328857,
      "learning_rate": 0.0001325625,
      "loss": 1.751,
      "step": 1080
    },
    {
      "epoch": 16.890625,
      "grad_norm": 2.1731975078582764,
      "learning_rate": 0.0001325,
      "loss": 1.8435,
      "step": 1081
    },
    {
      "epoch": 16.90625,
      "grad_norm": 2.4793663024902344,
      "learning_rate": 0.00013243750000000001,
      "loss": 1.8153,
      "step": 1082
    },
    {
      "epoch": 16.921875,
      "grad_norm": 2.5848615169525146,
      "learning_rate": 0.000132375,
      "loss": 1.7281,
      "step": 1083
    },
    {
      "epoch": 16.9375,
      "grad_norm": 2.0462636947631836,
      "learning_rate": 0.00013231250000000002,
      "loss": 1.5061,
      "step": 1084
    },
    {
      "epoch": 16.953125,
      "grad_norm": 2.203373432159424,
      "learning_rate": 0.00013225000000000002,
      "loss": 1.6661,
      "step": 1085
    },
    {
      "epoch": 16.96875,
      "grad_norm": 2.47755765914917,
      "learning_rate": 0.0001321875,
      "loss": 1.6857,
      "step": 1086
    },
    {
      "epoch": 16.984375,
      "grad_norm": 2.564730405807495,
      "learning_rate": 0.000132125,
      "loss": 1.735,
      "step": 1087
    },
    {
      "epoch": 17.0,
      "grad_norm": 2.376537561416626,
      "learning_rate": 0.0001320625,
      "loss": 1.2246,
      "step": 1088
    },
    {
      "epoch": 17.0,
      "eval_loss": 2.980295419692993,
      "eval_runtime": 2.8614,
      "eval_samples_per_second": 178.935,
      "eval_steps_per_second": 44.734,
      "step": 1088
    },
    {
      "epoch": 17.015625,
      "grad_norm": 2.3528060913085938,
      "learning_rate": 0.000132,
      "loss": 1.6169,
      "step": 1089
    },
    {
      "epoch": 17.03125,
      "grad_norm": 2.002195358276367,
      "learning_rate": 0.0001319375,
      "loss": 1.4763,
      "step": 1090
    },
    {
      "epoch": 17.046875,
      "grad_norm": 2.236665964126587,
      "learning_rate": 0.00013187500000000002,
      "loss": 1.3794,
      "step": 1091
    },
    {
      "epoch": 17.0625,
      "grad_norm": 2.3097972869873047,
      "learning_rate": 0.0001318125,
      "loss": 1.6049,
      "step": 1092
    },
    {
      "epoch": 17.078125,
      "grad_norm": 2.6020915508270264,
      "learning_rate": 0.00013175,
      "loss": 1.5856,
      "step": 1093
    },
    {
      "epoch": 17.09375,
      "grad_norm": 2.164355516433716,
      "learning_rate": 0.0001316875,
      "loss": 1.253,
      "step": 1094
    },
    {
      "epoch": 17.109375,
      "grad_norm": 2.3002941608428955,
      "learning_rate": 0.000131625,
      "loss": 1.8258,
      "step": 1095
    },
    {
      "epoch": 17.125,
      "grad_norm": 2.2714743614196777,
      "learning_rate": 0.0001315625,
      "loss": 1.7068,
      "step": 1096
    },
    {
      "epoch": 17.140625,
      "grad_norm": 2.2572391033172607,
      "learning_rate": 0.0001315,
      "loss": 1.3978,
      "step": 1097
    },
    {
      "epoch": 17.15625,
      "grad_norm": 2.2467024326324463,
      "learning_rate": 0.00013143750000000002,
      "loss": 1.6707,
      "step": 1098
    },
    {
      "epoch": 17.171875,
      "grad_norm": 2.265258312225342,
      "learning_rate": 0.000131375,
      "loss": 1.4282,
      "step": 1099
    },
    {
      "epoch": 17.1875,
      "grad_norm": 2.2689175605773926,
      "learning_rate": 0.00013131250000000003,
      "loss": 1.5521,
      "step": 1100
    },
    {
      "epoch": 17.203125,
      "grad_norm": 2.6518325805664062,
      "learning_rate": 0.00013125000000000002,
      "loss": 1.766,
      "step": 1101
    },
    {
      "epoch": 17.21875,
      "grad_norm": 2.391998529434204,
      "learning_rate": 0.00013118749999999998,
      "loss": 1.6229,
      "step": 1102
    },
    {
      "epoch": 17.234375,
      "grad_norm": 2.3639748096466064,
      "learning_rate": 0.000131125,
      "loss": 1.7069,
      "step": 1103
    },
    {
      "epoch": 17.25,
      "grad_norm": 2.32356595993042,
      "learning_rate": 0.0001310625,
      "loss": 1.5114,
      "step": 1104
    },
    {
      "epoch": 17.265625,
      "grad_norm": 2.2962846755981445,
      "learning_rate": 0.000131,
      "loss": 1.7195,
      "step": 1105
    },
    {
      "epoch": 17.28125,
      "grad_norm": 2.1110119819641113,
      "learning_rate": 0.0001309375,
      "loss": 1.5383,
      "step": 1106
    },
    {
      "epoch": 17.296875,
      "grad_norm": 2.363980293273926,
      "learning_rate": 0.00013087500000000002,
      "loss": 1.6426,
      "step": 1107
    },
    {
      "epoch": 17.3125,
      "grad_norm": 2.1445748805999756,
      "learning_rate": 0.00013081250000000002,
      "loss": 1.3754,
      "step": 1108
    },
    {
      "epoch": 17.328125,
      "grad_norm": 2.2072901725769043,
      "learning_rate": 0.00013075,
      "loss": 1.8122,
      "step": 1109
    },
    {
      "epoch": 17.34375,
      "grad_norm": 2.21512508392334,
      "learning_rate": 0.0001306875,
      "loss": 1.6729,
      "step": 1110
    },
    {
      "epoch": 17.359375,
      "grad_norm": 2.257763147354126,
      "learning_rate": 0.000130625,
      "loss": 1.7268,
      "step": 1111
    },
    {
      "epoch": 17.375,
      "grad_norm": 2.1872899532318115,
      "learning_rate": 0.0001305625,
      "loss": 1.783,
      "step": 1112
    },
    {
      "epoch": 17.390625,
      "grad_norm": 2.230437755584717,
      "learning_rate": 0.0001305,
      "loss": 1.5255,
      "step": 1113
    },
    {
      "epoch": 17.40625,
      "grad_norm": 2.5439112186431885,
      "learning_rate": 0.00013043750000000002,
      "loss": 1.6641,
      "step": 1114
    },
    {
      "epoch": 17.421875,
      "grad_norm": 2.6091439723968506,
      "learning_rate": 0.000130375,
      "loss": 1.5892,
      "step": 1115
    },
    {
      "epoch": 17.4375,
      "grad_norm": 2.3715596199035645,
      "learning_rate": 0.0001303125,
      "loss": 1.4036,
      "step": 1116
    },
    {
      "epoch": 17.453125,
      "grad_norm": 2.336414098739624,
      "learning_rate": 0.00013025,
      "loss": 1.4834,
      "step": 1117
    },
    {
      "epoch": 17.46875,
      "grad_norm": 2.2868802547454834,
      "learning_rate": 0.00013018749999999999,
      "loss": 1.7267,
      "step": 1118
    },
    {
      "epoch": 17.484375,
      "grad_norm": 2.724982500076294,
      "learning_rate": 0.000130125,
      "loss": 1.4176,
      "step": 1119
    },
    {
      "epoch": 17.5,
      "grad_norm": 2.3264307975769043,
      "learning_rate": 0.0001300625,
      "loss": 1.4701,
      "step": 1120
    },
    {
      "epoch": 17.515625,
      "grad_norm": 2.016785144805908,
      "learning_rate": 0.00013000000000000002,
      "loss": 1.5345,
      "step": 1121
    },
    {
      "epoch": 17.53125,
      "grad_norm": 2.217489004135132,
      "learning_rate": 0.0001299375,
      "loss": 1.784,
      "step": 1122
    },
    {
      "epoch": 17.546875,
      "grad_norm": 2.467740535736084,
      "learning_rate": 0.00012987500000000003,
      "loss": 1.5702,
      "step": 1123
    },
    {
      "epoch": 17.5625,
      "grad_norm": 2.3423218727111816,
      "learning_rate": 0.0001298125,
      "loss": 1.6178,
      "step": 1124
    },
    {
      "epoch": 17.578125,
      "grad_norm": 2.4310052394866943,
      "learning_rate": 0.00012975,
      "loss": 1.5592,
      "step": 1125
    },
    {
      "epoch": 17.59375,
      "grad_norm": 2.7701969146728516,
      "learning_rate": 0.0001296875,
      "loss": 1.5112,
      "step": 1126
    },
    {
      "epoch": 17.609375,
      "grad_norm": 2.362029552459717,
      "learning_rate": 0.000129625,
      "loss": 1.7506,
      "step": 1127
    },
    {
      "epoch": 17.625,
      "grad_norm": 2.2556939125061035,
      "learning_rate": 0.0001295625,
      "loss": 1.482,
      "step": 1128
    },
    {
      "epoch": 17.640625,
      "grad_norm": 2.4716219902038574,
      "learning_rate": 0.0001295,
      "loss": 1.4599,
      "step": 1129
    },
    {
      "epoch": 17.65625,
      "grad_norm": 2.5690419673919678,
      "learning_rate": 0.00012943750000000002,
      "loss": 1.525,
      "step": 1130
    },
    {
      "epoch": 17.671875,
      "grad_norm": 2.4905319213867188,
      "learning_rate": 0.00012937500000000001,
      "loss": 1.5557,
      "step": 1131
    },
    {
      "epoch": 17.6875,
      "grad_norm": 2.1701266765594482,
      "learning_rate": 0.0001293125,
      "loss": 1.6142,
      "step": 1132
    },
    {
      "epoch": 17.703125,
      "grad_norm": 2.097571611404419,
      "learning_rate": 0.00012925,
      "loss": 1.5201,
      "step": 1133
    },
    {
      "epoch": 17.71875,
      "grad_norm": 2.3370938301086426,
      "learning_rate": 0.00012918750000000002,
      "loss": 1.6559,
      "step": 1134
    },
    {
      "epoch": 17.734375,
      "grad_norm": 2.463303804397583,
      "learning_rate": 0.000129125,
      "loss": 1.5383,
      "step": 1135
    },
    {
      "epoch": 17.75,
      "grad_norm": 2.326970100402832,
      "learning_rate": 0.0001290625,
      "loss": 1.5598,
      "step": 1136
    },
    {
      "epoch": 17.765625,
      "grad_norm": 2.4267354011535645,
      "learning_rate": 0.00012900000000000002,
      "loss": 1.4362,
      "step": 1137
    },
    {
      "epoch": 17.78125,
      "grad_norm": 2.324108839035034,
      "learning_rate": 0.0001289375,
      "loss": 1.4266,
      "step": 1138
    },
    {
      "epoch": 17.796875,
      "grad_norm": 2.1814589500427246,
      "learning_rate": 0.000128875,
      "loss": 1.668,
      "step": 1139
    },
    {
      "epoch": 17.8125,
      "grad_norm": 2.524911403656006,
      "learning_rate": 0.0001288125,
      "loss": 1.6104,
      "step": 1140
    },
    {
      "epoch": 17.828125,
      "grad_norm": 2.2576186656951904,
      "learning_rate": 0.00012875,
      "loss": 1.5268,
      "step": 1141
    },
    {
      "epoch": 17.84375,
      "grad_norm": 2.5341644287109375,
      "learning_rate": 0.0001286875,
      "loss": 1.5424,
      "step": 1142
    },
    {
      "epoch": 17.859375,
      "grad_norm": 2.5122694969177246,
      "learning_rate": 0.000128625,
      "loss": 1.5554,
      "step": 1143
    },
    {
      "epoch": 17.875,
      "grad_norm": 2.5271801948547363,
      "learning_rate": 0.00012856250000000001,
      "loss": 1.6179,
      "step": 1144
    },
    {
      "epoch": 17.890625,
      "grad_norm": 2.3950092792510986,
      "learning_rate": 0.0001285,
      "loss": 1.5501,
      "step": 1145
    },
    {
      "epoch": 17.90625,
      "grad_norm": 2.291853904724121,
      "learning_rate": 0.0001284375,
      "loss": 1.5918,
      "step": 1146
    },
    {
      "epoch": 17.921875,
      "grad_norm": 2.2610294818878174,
      "learning_rate": 0.000128375,
      "loss": 1.5295,
      "step": 1147
    },
    {
      "epoch": 17.9375,
      "grad_norm": 2.2117831707000732,
      "learning_rate": 0.0001283125,
      "loss": 1.6205,
      "step": 1148
    },
    {
      "epoch": 17.953125,
      "grad_norm": 2.738036870956421,
      "learning_rate": 0.00012825,
      "loss": 1.7819,
      "step": 1149
    },
    {
      "epoch": 17.96875,
      "grad_norm": 2.48307466506958,
      "learning_rate": 0.00012818750000000002,
      "loss": 1.5225,
      "step": 1150
    },
    {
      "epoch": 17.984375,
      "grad_norm": 2.4282166957855225,
      "learning_rate": 0.000128125,
      "loss": 1.6465,
      "step": 1151
    },
    {
      "epoch": 18.0,
      "grad_norm": 3.3730368614196777,
      "learning_rate": 0.0001280625,
      "loss": 1.7469,
      "step": 1152
    },
    {
      "epoch": 18.0,
      "eval_loss": 2.983654022216797,
      "eval_runtime": 3.025,
      "eval_samples_per_second": 169.258,
      "eval_steps_per_second": 42.314,
      "step": 1152
    },
    {
      "epoch": 18.015625,
      "grad_norm": 2.2073822021484375,
      "learning_rate": 0.00012800000000000002,
      "loss": 1.5642,
      "step": 1153
    },
    {
      "epoch": 18.03125,
      "grad_norm": 2.2069287300109863,
      "learning_rate": 0.00012793749999999999,
      "loss": 1.4648,
      "step": 1154
    },
    {
      "epoch": 18.046875,
      "grad_norm": 2.2486302852630615,
      "learning_rate": 0.000127875,
      "loss": 1.6158,
      "step": 1155
    },
    {
      "epoch": 18.0625,
      "grad_norm": 2.1341257095336914,
      "learning_rate": 0.0001278125,
      "loss": 1.5431,
      "step": 1156
    },
    {
      "epoch": 18.078125,
      "grad_norm": 2.1620848178863525,
      "learning_rate": 0.00012775000000000002,
      "loss": 1.5848,
      "step": 1157
    },
    {
      "epoch": 18.09375,
      "grad_norm": 2.0786020755767822,
      "learning_rate": 0.0001276875,
      "loss": 1.4264,
      "step": 1158
    },
    {
      "epoch": 18.109375,
      "grad_norm": 2.0977678298950195,
      "learning_rate": 0.00012762500000000003,
      "loss": 1.3542,
      "step": 1159
    },
    {
      "epoch": 18.125,
      "grad_norm": 2.3005452156066895,
      "learning_rate": 0.00012756250000000002,
      "loss": 1.451,
      "step": 1160
    },
    {
      "epoch": 18.140625,
      "grad_norm": 2.1336023807525635,
      "learning_rate": 0.0001275,
      "loss": 1.6152,
      "step": 1161
    },
    {
      "epoch": 18.15625,
      "grad_norm": 2.244865655899048,
      "learning_rate": 0.0001274375,
      "loss": 1.7451,
      "step": 1162
    },
    {
      "epoch": 18.171875,
      "grad_norm": 2.1232361793518066,
      "learning_rate": 0.000127375,
      "loss": 1.3723,
      "step": 1163
    },
    {
      "epoch": 18.1875,
      "grad_norm": 2.0114822387695312,
      "learning_rate": 0.0001273125,
      "loss": 1.6598,
      "step": 1164
    },
    {
      "epoch": 18.203125,
      "grad_norm": 2.545685052871704,
      "learning_rate": 0.00012725,
      "loss": 1.66,
      "step": 1165
    },
    {
      "epoch": 18.21875,
      "grad_norm": 2.5415537357330322,
      "learning_rate": 0.00012718750000000002,
      "loss": 1.5024,
      "step": 1166
    },
    {
      "epoch": 18.234375,
      "grad_norm": 2.303285837173462,
      "learning_rate": 0.00012712500000000001,
      "loss": 1.5663,
      "step": 1167
    },
    {
      "epoch": 18.25,
      "grad_norm": 2.3989949226379395,
      "learning_rate": 0.0001270625,
      "loss": 1.5172,
      "step": 1168
    },
    {
      "epoch": 18.265625,
      "grad_norm": 2.1041100025177,
      "learning_rate": 0.000127,
      "loss": 1.2436,
      "step": 1169
    },
    {
      "epoch": 18.28125,
      "grad_norm": 2.4558959007263184,
      "learning_rate": 0.0001269375,
      "loss": 1.6407,
      "step": 1170
    },
    {
      "epoch": 18.296875,
      "grad_norm": 2.463306188583374,
      "learning_rate": 0.000126875,
      "loss": 1.5021,
      "step": 1171
    },
    {
      "epoch": 18.3125,
      "grad_norm": 2.4476470947265625,
      "learning_rate": 0.0001268125,
      "loss": 1.4026,
      "step": 1172
    },
    {
      "epoch": 18.328125,
      "grad_norm": 2.192416191101074,
      "learning_rate": 0.00012675000000000002,
      "loss": 1.5016,
      "step": 1173
    },
    {
      "epoch": 18.34375,
      "grad_norm": 2.258491277694702,
      "learning_rate": 0.0001266875,
      "loss": 1.4315,
      "step": 1174
    },
    {
      "epoch": 18.359375,
      "grad_norm": 2.13092303276062,
      "learning_rate": 0.00012662500000000003,
      "loss": 1.5779,
      "step": 1175
    },
    {
      "epoch": 18.375,
      "grad_norm": 2.6110877990722656,
      "learning_rate": 0.0001265625,
      "loss": 1.5825,
      "step": 1176
    },
    {
      "epoch": 18.390625,
      "grad_norm": 2.4971983432769775,
      "learning_rate": 0.00012649999999999998,
      "loss": 1.6464,
      "step": 1177
    },
    {
      "epoch": 18.40625,
      "grad_norm": 2.412119150161743,
      "learning_rate": 0.0001264375,
      "loss": 1.3401,
      "step": 1178
    },
    {
      "epoch": 18.421875,
      "grad_norm": 2.3911359310150146,
      "learning_rate": 0.000126375,
      "loss": 1.7259,
      "step": 1179
    },
    {
      "epoch": 18.4375,
      "grad_norm": 2.358228921890259,
      "learning_rate": 0.00012631250000000001,
      "loss": 1.3174,
      "step": 1180
    },
    {
      "epoch": 18.453125,
      "grad_norm": 2.7335872650146484,
      "learning_rate": 0.00012625,
      "loss": 1.8481,
      "step": 1181
    },
    {
      "epoch": 18.46875,
      "grad_norm": 2.3854353427886963,
      "learning_rate": 0.00012618750000000002,
      "loss": 1.5959,
      "step": 1182
    },
    {
      "epoch": 18.484375,
      "grad_norm": 2.1138267517089844,
      "learning_rate": 0.00012612500000000002,
      "loss": 1.529,
      "step": 1183
    },
    {
      "epoch": 18.5,
      "grad_norm": 2.260463237762451,
      "learning_rate": 0.0001260625,
      "loss": 1.622,
      "step": 1184
    },
    {
      "epoch": 18.515625,
      "grad_norm": 2.3426265716552734,
      "learning_rate": 0.000126,
      "loss": 1.7127,
      "step": 1185
    },
    {
      "epoch": 18.53125,
      "grad_norm": 2.6920158863067627,
      "learning_rate": 0.0001259375,
      "loss": 1.5914,
      "step": 1186
    },
    {
      "epoch": 18.546875,
      "grad_norm": 2.720867156982422,
      "learning_rate": 0.000125875,
      "loss": 1.8006,
      "step": 1187
    },
    {
      "epoch": 18.5625,
      "grad_norm": 2.3261234760284424,
      "learning_rate": 0.0001258125,
      "loss": 1.41,
      "step": 1188
    },
    {
      "epoch": 18.578125,
      "grad_norm": 2.3369534015655518,
      "learning_rate": 0.00012575000000000002,
      "loss": 1.6499,
      "step": 1189
    },
    {
      "epoch": 18.59375,
      "grad_norm": 2.3857860565185547,
      "learning_rate": 0.0001256875,
      "loss": 1.6135,
      "step": 1190
    },
    {
      "epoch": 18.609375,
      "grad_norm": 2.5196287631988525,
      "learning_rate": 0.000125625,
      "loss": 1.7688,
      "step": 1191
    },
    {
      "epoch": 18.625,
      "grad_norm": 2.512078285217285,
      "learning_rate": 0.0001255625,
      "loss": 1.2718,
      "step": 1192
    },
    {
      "epoch": 18.640625,
      "grad_norm": 2.260312080383301,
      "learning_rate": 0.0001255,
      "loss": 1.6446,
      "step": 1193
    },
    {
      "epoch": 18.65625,
      "grad_norm": 2.381605386734009,
      "learning_rate": 0.0001254375,
      "loss": 1.5508,
      "step": 1194
    },
    {
      "epoch": 18.671875,
      "grad_norm": 2.4233648777008057,
      "learning_rate": 0.000125375,
      "loss": 1.4692,
      "step": 1195
    },
    {
      "epoch": 18.6875,
      "grad_norm": 2.080077886581421,
      "learning_rate": 0.00012531250000000002,
      "loss": 1.5566,
      "step": 1196
    },
    {
      "epoch": 18.703125,
      "grad_norm": 2.3584628105163574,
      "learning_rate": 0.00012525,
      "loss": 1.4006,
      "step": 1197
    },
    {
      "epoch": 18.71875,
      "grad_norm": 2.431485414505005,
      "learning_rate": 0.0001251875,
      "loss": 1.5899,
      "step": 1198
    },
    {
      "epoch": 18.734375,
      "grad_norm": 2.453082799911499,
      "learning_rate": 0.000125125,
      "loss": 1.5501,
      "step": 1199
    },
    {
      "epoch": 18.75,
      "grad_norm": 2.505800724029541,
      "learning_rate": 0.0001250625,
      "loss": 1.5869,
      "step": 1200
    },
    {
      "epoch": 18.765625,
      "grad_norm": 2.400134563446045,
      "learning_rate": 0.000125,
      "loss": 1.7692,
      "step": 1201
    },
    {
      "epoch": 18.78125,
      "grad_norm": 2.341794729232788,
      "learning_rate": 0.0001249375,
      "loss": 1.6075,
      "step": 1202
    },
    {
      "epoch": 18.796875,
      "grad_norm": 2.3253331184387207,
      "learning_rate": 0.000124875,
      "loss": 1.3392,
      "step": 1203
    },
    {
      "epoch": 18.8125,
      "grad_norm": 2.633669853210449,
      "learning_rate": 0.0001248125,
      "loss": 1.5684,
      "step": 1204
    },
    {
      "epoch": 18.828125,
      "grad_norm": 2.3983819484710693,
      "learning_rate": 0.00012475000000000002,
      "loss": 1.53,
      "step": 1205
    },
    {
      "epoch": 18.84375,
      "grad_norm": 2.349421262741089,
      "learning_rate": 0.0001246875,
      "loss": 1.3091,
      "step": 1206
    },
    {
      "epoch": 18.859375,
      "grad_norm": 2.561373472213745,
      "learning_rate": 0.000124625,
      "loss": 1.7786,
      "step": 1207
    },
    {
      "epoch": 18.875,
      "grad_norm": 2.413278818130493,
      "learning_rate": 0.0001245625,
      "loss": 1.567,
      "step": 1208
    },
    {
      "epoch": 18.890625,
      "grad_norm": 2.4485208988189697,
      "learning_rate": 0.00012450000000000002,
      "loss": 1.2969,
      "step": 1209
    },
    {
      "epoch": 18.90625,
      "grad_norm": 2.531663417816162,
      "learning_rate": 0.0001244375,
      "loss": 1.7295,
      "step": 1210
    },
    {
      "epoch": 18.921875,
      "grad_norm": 2.4634737968444824,
      "learning_rate": 0.000124375,
      "loss": 1.6032,
      "step": 1211
    },
    {
      "epoch": 18.9375,
      "grad_norm": 2.316511869430542,
      "learning_rate": 0.00012431250000000002,
      "loss": 1.649,
      "step": 1212
    },
    {
      "epoch": 18.953125,
      "grad_norm": 2.043386697769165,
      "learning_rate": 0.00012425,
      "loss": 1.4764,
      "step": 1213
    },
    {
      "epoch": 18.96875,
      "grad_norm": 2.7339837551116943,
      "learning_rate": 0.0001241875,
      "loss": 1.4314,
      "step": 1214
    },
    {
      "epoch": 18.984375,
      "grad_norm": 2.4561614990234375,
      "learning_rate": 0.000124125,
      "loss": 1.4866,
      "step": 1215
    },
    {
      "epoch": 19.0,
      "grad_norm": 2.5097262859344482,
      "learning_rate": 0.00012406250000000001,
      "loss": 1.4837,
      "step": 1216
    },
    {
      "epoch": 19.0,
      "eval_loss": 2.992797374725342,
      "eval_runtime": 2.9046,
      "eval_samples_per_second": 176.27,
      "eval_steps_per_second": 44.068,
      "step": 1216
    },
    {
      "epoch": 19.015625,
      "grad_norm": 2.3046767711639404,
      "learning_rate": 0.000124,
      "loss": 1.5659,
      "step": 1217
    },
    {
      "epoch": 19.03125,
      "grad_norm": 2.2102558612823486,
      "learning_rate": 0.0001239375,
      "loss": 1.4746,
      "step": 1218
    },
    {
      "epoch": 19.046875,
      "grad_norm": 2.3126492500305176,
      "learning_rate": 0.00012387500000000002,
      "loss": 1.6911,
      "step": 1219
    },
    {
      "epoch": 19.0625,
      "grad_norm": 2.4210622310638428,
      "learning_rate": 0.0001238125,
      "loss": 1.4364,
      "step": 1220
    },
    {
      "epoch": 19.078125,
      "grad_norm": 2.32171630859375,
      "learning_rate": 0.00012375,
      "loss": 1.5264,
      "step": 1221
    },
    {
      "epoch": 19.09375,
      "grad_norm": 2.227292776107788,
      "learning_rate": 0.0001236875,
      "loss": 1.6767,
      "step": 1222
    },
    {
      "epoch": 19.109375,
      "grad_norm": 2.5036709308624268,
      "learning_rate": 0.000123625,
      "loss": 1.3139,
      "step": 1223
    },
    {
      "epoch": 19.125,
      "grad_norm": 2.458350658416748,
      "learning_rate": 0.0001235625,
      "loss": 1.4244,
      "step": 1224
    },
    {
      "epoch": 19.140625,
      "grad_norm": 2.5163187980651855,
      "learning_rate": 0.00012350000000000002,
      "loss": 1.5801,
      "step": 1225
    },
    {
      "epoch": 19.15625,
      "grad_norm": 2.2337372303009033,
      "learning_rate": 0.0001234375,
      "loss": 1.6399,
      "step": 1226
    },
    {
      "epoch": 19.171875,
      "grad_norm": 2.2975502014160156,
      "learning_rate": 0.000123375,
      "loss": 1.5204,
      "step": 1227
    },
    {
      "epoch": 19.1875,
      "grad_norm": 2.672776699066162,
      "learning_rate": 0.0001233125,
      "loss": 1.6124,
      "step": 1228
    },
    {
      "epoch": 19.203125,
      "grad_norm": 2.365511894226074,
      "learning_rate": 0.00012325,
      "loss": 1.5329,
      "step": 1229
    },
    {
      "epoch": 19.21875,
      "grad_norm": 2.4900169372558594,
      "learning_rate": 0.0001231875,
      "loss": 1.4396,
      "step": 1230
    },
    {
      "epoch": 19.234375,
      "grad_norm": 2.367614269256592,
      "learning_rate": 0.000123125,
      "loss": 1.6586,
      "step": 1231
    },
    {
      "epoch": 19.25,
      "grad_norm": 2.631587028503418,
      "learning_rate": 0.00012306250000000002,
      "loss": 1.3284,
      "step": 1232
    },
    {
      "epoch": 19.265625,
      "grad_norm": 2.292802095413208,
      "learning_rate": 0.000123,
      "loss": 1.3867,
      "step": 1233
    },
    {
      "epoch": 19.28125,
      "grad_norm": 2.533846139907837,
      "learning_rate": 0.00012293750000000003,
      "loss": 1.3496,
      "step": 1234
    },
    {
      "epoch": 19.296875,
      "grad_norm": 2.1836283206939697,
      "learning_rate": 0.00012287500000000002,
      "loss": 1.5008,
      "step": 1235
    },
    {
      "epoch": 19.3125,
      "grad_norm": 2.3986661434173584,
      "learning_rate": 0.0001228125,
      "loss": 1.5935,
      "step": 1236
    },
    {
      "epoch": 19.328125,
      "grad_norm": 2.2650434970855713,
      "learning_rate": 0.00012275,
      "loss": 1.4008,
      "step": 1237
    },
    {
      "epoch": 19.34375,
      "grad_norm": 2.267343759536743,
      "learning_rate": 0.0001226875,
      "loss": 1.4436,
      "step": 1238
    },
    {
      "epoch": 19.359375,
      "grad_norm": 2.1697800159454346,
      "learning_rate": 0.000122625,
      "loss": 1.4039,
      "step": 1239
    },
    {
      "epoch": 19.375,
      "grad_norm": 2.343322992324829,
      "learning_rate": 0.0001225625,
      "loss": 1.3973,
      "step": 1240
    },
    {
      "epoch": 19.390625,
      "grad_norm": 2.1162703037261963,
      "learning_rate": 0.00012250000000000002,
      "loss": 1.5554,
      "step": 1241
    },
    {
      "epoch": 19.40625,
      "grad_norm": 2.204378366470337,
      "learning_rate": 0.00012243750000000001,
      "loss": 1.3044,
      "step": 1242
    },
    {
      "epoch": 19.421875,
      "grad_norm": 2.420504570007324,
      "learning_rate": 0.000122375,
      "loss": 1.7392,
      "step": 1243
    },
    {
      "epoch": 19.4375,
      "grad_norm": 2.2042222023010254,
      "learning_rate": 0.0001223125,
      "loss": 1.4755,
      "step": 1244
    },
    {
      "epoch": 19.453125,
      "grad_norm": 2.4673290252685547,
      "learning_rate": 0.00012225,
      "loss": 1.6472,
      "step": 1245
    },
    {
      "epoch": 19.46875,
      "grad_norm": 2.3712048530578613,
      "learning_rate": 0.0001221875,
      "loss": 1.6798,
      "step": 1246
    },
    {
      "epoch": 19.484375,
      "grad_norm": 2.3500728607177734,
      "learning_rate": 0.000122125,
      "loss": 1.7778,
      "step": 1247
    },
    {
      "epoch": 19.5,
      "grad_norm": 2.661395788192749,
      "learning_rate": 0.00012206250000000002,
      "loss": 1.7119,
      "step": 1248
    },
    {
      "epoch": 19.515625,
      "grad_norm": 2.3171231746673584,
      "learning_rate": 0.000122,
      "loss": 1.6253,
      "step": 1249
    },
    {
      "epoch": 19.53125,
      "grad_norm": 2.502124071121216,
      "learning_rate": 0.00012193750000000002,
      "loss": 1.5487,
      "step": 1250
    },
    {
      "epoch": 19.546875,
      "grad_norm": 2.4207773208618164,
      "learning_rate": 0.00012187500000000001,
      "loss": 1.5908,
      "step": 1251
    },
    {
      "epoch": 19.5625,
      "grad_norm": 2.2301101684570312,
      "learning_rate": 0.0001218125,
      "loss": 1.2655,
      "step": 1252
    },
    {
      "epoch": 19.578125,
      "grad_norm": 1.9582808017730713,
      "learning_rate": 0.00012175,
      "loss": 1.506,
      "step": 1253
    },
    {
      "epoch": 19.59375,
      "grad_norm": 2.6129252910614014,
      "learning_rate": 0.0001216875,
      "loss": 1.4683,
      "step": 1254
    },
    {
      "epoch": 19.609375,
      "grad_norm": 2.3810625076293945,
      "learning_rate": 0.00012162500000000002,
      "loss": 1.7522,
      "step": 1255
    },
    {
      "epoch": 19.625,
      "grad_norm": 2.4084177017211914,
      "learning_rate": 0.00012156250000000001,
      "loss": 1.4608,
      "step": 1256
    },
    {
      "epoch": 19.640625,
      "grad_norm": 2.3228607177734375,
      "learning_rate": 0.00012150000000000001,
      "loss": 1.5586,
      "step": 1257
    },
    {
      "epoch": 19.65625,
      "grad_norm": 2.4409337043762207,
      "learning_rate": 0.0001214375,
      "loss": 1.432,
      "step": 1258
    },
    {
      "epoch": 19.671875,
      "grad_norm": 2.4916393756866455,
      "learning_rate": 0.00012137500000000002,
      "loss": 1.446,
      "step": 1259
    },
    {
      "epoch": 19.6875,
      "grad_norm": 2.555722236633301,
      "learning_rate": 0.0001213125,
      "loss": 1.5859,
      "step": 1260
    },
    {
      "epoch": 19.703125,
      "grad_norm": 2.427537441253662,
      "learning_rate": 0.00012124999999999999,
      "loss": 1.6147,
      "step": 1261
    },
    {
      "epoch": 19.71875,
      "grad_norm": 2.483267307281494,
      "learning_rate": 0.00012118750000000001,
      "loss": 1.4711,
      "step": 1262
    },
    {
      "epoch": 19.734375,
      "grad_norm": 2.162001848220825,
      "learning_rate": 0.000121125,
      "loss": 1.3606,
      "step": 1263
    },
    {
      "epoch": 19.75,
      "grad_norm": 2.527200698852539,
      "learning_rate": 0.00012106250000000001,
      "loss": 1.4704,
      "step": 1264
    },
    {
      "epoch": 19.765625,
      "grad_norm": 2.744156837463379,
      "learning_rate": 0.000121,
      "loss": 1.6038,
      "step": 1265
    },
    {
      "epoch": 19.78125,
      "grad_norm": 2.846970558166504,
      "learning_rate": 0.00012093750000000002,
      "loss": 1.5948,
      "step": 1266
    },
    {
      "epoch": 19.796875,
      "grad_norm": 2.5859951972961426,
      "learning_rate": 0.00012087500000000001,
      "loss": 1.4164,
      "step": 1267
    },
    {
      "epoch": 19.8125,
      "grad_norm": 2.4302327632904053,
      "learning_rate": 0.00012081249999999999,
      "loss": 1.6108,
      "step": 1268
    },
    {
      "epoch": 19.828125,
      "grad_norm": 2.119166374206543,
      "learning_rate": 0.00012075000000000001,
      "loss": 1.3502,
      "step": 1269
    },
    {
      "epoch": 19.84375,
      "grad_norm": 2.2231173515319824,
      "learning_rate": 0.0001206875,
      "loss": 1.4278,
      "step": 1270
    },
    {
      "epoch": 19.859375,
      "grad_norm": 2.419410228729248,
      "learning_rate": 0.000120625,
      "loss": 1.6119,
      "step": 1271
    },
    {
      "epoch": 19.875,
      "grad_norm": 2.4256765842437744,
      "learning_rate": 0.0001205625,
      "loss": 1.4331,
      "step": 1272
    },
    {
      "epoch": 19.890625,
      "grad_norm": 2.30008602142334,
      "learning_rate": 0.00012050000000000002,
      "loss": 1.422,
      "step": 1273
    },
    {
      "epoch": 19.90625,
      "grad_norm": 2.4560861587524414,
      "learning_rate": 0.00012043750000000001,
      "loss": 1.8573,
      "step": 1274
    },
    {
      "epoch": 19.921875,
      "grad_norm": 2.1856155395507812,
      "learning_rate": 0.00012037500000000001,
      "loss": 1.7879,
      "step": 1275
    },
    {
      "epoch": 19.9375,
      "grad_norm": 2.4841485023498535,
      "learning_rate": 0.0001203125,
      "loss": 1.6616,
      "step": 1276
    },
    {
      "epoch": 19.953125,
      "grad_norm": 2.5056169033050537,
      "learning_rate": 0.00012025,
      "loss": 1.4575,
      "step": 1277
    },
    {
      "epoch": 19.96875,
      "grad_norm": 2.357048511505127,
      "learning_rate": 0.00012018750000000001,
      "loss": 0.9693,
      "step": 1278
    },
    {
      "epoch": 19.984375,
      "grad_norm": 2.4700534343719482,
      "learning_rate": 0.00012012499999999999,
      "loss": 1.4979,
      "step": 1279
    },
    {
      "epoch": 20.0,
      "grad_norm": 2.861720561981201,
      "learning_rate": 0.00012006250000000001,
      "loss": 1.4119,
      "step": 1280
    },
    {
      "epoch": 20.0,
      "eval_loss": 3.0117173194885254,
      "eval_runtime": 2.9889,
      "eval_samples_per_second": 171.303,
      "eval_steps_per_second": 42.826,
      "step": 1280
    },
    {
      "epoch": 20.015625,
      "grad_norm": 2.5277364253997803,
      "learning_rate": 0.00012,
      "loss": 1.4538,
      "step": 1281
    },
    {
      "epoch": 20.03125,
      "grad_norm": 2.301820993423462,
      "learning_rate": 0.00011993750000000001,
      "loss": 1.6735,
      "step": 1282
    },
    {
      "epoch": 20.046875,
      "grad_norm": 2.3176188468933105,
      "learning_rate": 0.000119875,
      "loss": 1.4599,
      "step": 1283
    },
    {
      "epoch": 20.0625,
      "grad_norm": 2.2174859046936035,
      "learning_rate": 0.00011981250000000002,
      "loss": 1.6621,
      "step": 1284
    },
    {
      "epoch": 20.078125,
      "grad_norm": 2.3085668087005615,
      "learning_rate": 0.00011975000000000001,
      "loss": 1.4532,
      "step": 1285
    },
    {
      "epoch": 20.09375,
      "grad_norm": 2.19671368598938,
      "learning_rate": 0.0001196875,
      "loss": 1.5956,
      "step": 1286
    },
    {
      "epoch": 20.109375,
      "grad_norm": 2.2113945484161377,
      "learning_rate": 0.00011962500000000001,
      "loss": 1.6409,
      "step": 1287
    },
    {
      "epoch": 20.125,
      "grad_norm": 2.5970351696014404,
      "learning_rate": 0.0001195625,
      "loss": 1.5651,
      "step": 1288
    },
    {
      "epoch": 20.140625,
      "grad_norm": 2.3962409496307373,
      "learning_rate": 0.00011950000000000002,
      "loss": 1.3744,
      "step": 1289
    },
    {
      "epoch": 20.15625,
      "grad_norm": 2.707242250442505,
      "learning_rate": 0.0001194375,
      "loss": 1.7033,
      "step": 1290
    },
    {
      "epoch": 20.171875,
      "grad_norm": 2.5084354877471924,
      "learning_rate": 0.00011937500000000001,
      "loss": 1.5605,
      "step": 1291
    },
    {
      "epoch": 20.1875,
      "grad_norm": 2.532650947570801,
      "learning_rate": 0.0001193125,
      "loss": 1.3781,
      "step": 1292
    },
    {
      "epoch": 20.203125,
      "grad_norm": 2.663698434829712,
      "learning_rate": 0.00011925,
      "loss": 1.4938,
      "step": 1293
    },
    {
      "epoch": 20.21875,
      "grad_norm": 2.4255897998809814,
      "learning_rate": 0.0001191875,
      "loss": 1.5403,
      "step": 1294
    },
    {
      "epoch": 20.234375,
      "grad_norm": 2.0586330890655518,
      "learning_rate": 0.000119125,
      "loss": 1.4378,
      "step": 1295
    },
    {
      "epoch": 20.25,
      "grad_norm": 2.615138530731201,
      "learning_rate": 0.00011906250000000001,
      "loss": 1.4057,
      "step": 1296
    },
    {
      "epoch": 20.265625,
      "grad_norm": 2.3831536769866943,
      "learning_rate": 0.000119,
      "loss": 1.2738,
      "step": 1297
    },
    {
      "epoch": 20.28125,
      "grad_norm": 2.444161891937256,
      "learning_rate": 0.00011893750000000001,
      "loss": 1.334,
      "step": 1298
    },
    {
      "epoch": 20.296875,
      "grad_norm": 2.6275994777679443,
      "learning_rate": 0.000118875,
      "loss": 1.5446,
      "step": 1299
    },
    {
      "epoch": 20.3125,
      "grad_norm": 2.0444772243499756,
      "learning_rate": 0.00011881250000000002,
      "loss": 1.3596,
      "step": 1300
    },
    {
      "epoch": 20.328125,
      "grad_norm": 2.5002832412719727,
      "learning_rate": 0.00011875,
      "loss": 1.5114,
      "step": 1301
    },
    {
      "epoch": 20.34375,
      "grad_norm": 2.325784921646118,
      "learning_rate": 0.00011868749999999999,
      "loss": 1.5065,
      "step": 1302
    },
    {
      "epoch": 20.359375,
      "grad_norm": 2.627535820007324,
      "learning_rate": 0.00011862500000000001,
      "loss": 1.4105,
      "step": 1303
    },
    {
      "epoch": 20.375,
      "grad_norm": 2.121304750442505,
      "learning_rate": 0.0001185625,
      "loss": 1.3655,
      "step": 1304
    },
    {
      "epoch": 20.390625,
      "grad_norm": 2.570377826690674,
      "learning_rate": 0.00011850000000000001,
      "loss": 1.4565,
      "step": 1305
    },
    {
      "epoch": 20.40625,
      "grad_norm": 2.242718458175659,
      "learning_rate": 0.0001184375,
      "loss": 1.4679,
      "step": 1306
    },
    {
      "epoch": 20.421875,
      "grad_norm": 2.2107982635498047,
      "learning_rate": 0.00011837500000000002,
      "loss": 1.4513,
      "step": 1307
    },
    {
      "epoch": 20.4375,
      "grad_norm": 2.569570541381836,
      "learning_rate": 0.00011831250000000001,
      "loss": 1.4723,
      "step": 1308
    },
    {
      "epoch": 20.453125,
      "grad_norm": 2.395604372024536,
      "learning_rate": 0.00011825000000000001,
      "loss": 1.3758,
      "step": 1309
    },
    {
      "epoch": 20.46875,
      "grad_norm": 2.2814083099365234,
      "learning_rate": 0.0001181875,
      "loss": 1.3783,
      "step": 1310
    },
    {
      "epoch": 20.484375,
      "grad_norm": 2.3266546726226807,
      "learning_rate": 0.000118125,
      "loss": 1.6058,
      "step": 1311
    },
    {
      "epoch": 20.5,
      "grad_norm": 2.3328282833099365,
      "learning_rate": 0.0001180625,
      "loss": 1.3071,
      "step": 1312
    },
    {
      "epoch": 20.515625,
      "grad_norm": 2.491814136505127,
      "learning_rate": 0.000118,
      "loss": 1.2988,
      "step": 1313
    },
    {
      "epoch": 20.53125,
      "grad_norm": 2.253509759902954,
      "learning_rate": 0.00011793750000000001,
      "loss": 1.336,
      "step": 1314
    },
    {
      "epoch": 20.546875,
      "grad_norm": 2.4440042972564697,
      "learning_rate": 0.000117875,
      "loss": 1.5451,
      "step": 1315
    },
    {
      "epoch": 20.5625,
      "grad_norm": 2.212416887283325,
      "learning_rate": 0.00011781250000000001,
      "loss": 1.5018,
      "step": 1316
    },
    {
      "epoch": 20.578125,
      "grad_norm": 2.49440860748291,
      "learning_rate": 0.00011775,
      "loss": 1.2649,
      "step": 1317
    },
    {
      "epoch": 20.59375,
      "grad_norm": 2.3975019454956055,
      "learning_rate": 0.0001176875,
      "loss": 1.5165,
      "step": 1318
    },
    {
      "epoch": 20.609375,
      "grad_norm": 2.4492459297180176,
      "learning_rate": 0.00011762500000000001,
      "loss": 1.2042,
      "step": 1319
    },
    {
      "epoch": 20.625,
      "grad_norm": 2.246516227722168,
      "learning_rate": 0.00011756249999999999,
      "loss": 1.278,
      "step": 1320
    },
    {
      "epoch": 20.640625,
      "grad_norm": 2.4138784408569336,
      "learning_rate": 0.00011750000000000001,
      "loss": 1.5584,
      "step": 1321
    },
    {
      "epoch": 20.65625,
      "grad_norm": 2.3102543354034424,
      "learning_rate": 0.0001174375,
      "loss": 1.5534,
      "step": 1322
    },
    {
      "epoch": 20.671875,
      "grad_norm": 2.100698709487915,
      "learning_rate": 0.00011737500000000001,
      "loss": 1.5641,
      "step": 1323
    },
    {
      "epoch": 20.6875,
      "grad_norm": 2.5552985668182373,
      "learning_rate": 0.0001173125,
      "loss": 1.5809,
      "step": 1324
    },
    {
      "epoch": 20.703125,
      "grad_norm": 2.510310411453247,
      "learning_rate": 0.00011725000000000002,
      "loss": 1.4892,
      "step": 1325
    },
    {
      "epoch": 20.71875,
      "grad_norm": 2.1348860263824463,
      "learning_rate": 0.00011718750000000001,
      "loss": 1.773,
      "step": 1326
    },
    {
      "epoch": 20.734375,
      "grad_norm": 2.3512320518493652,
      "learning_rate": 0.000117125,
      "loss": 1.4445,
      "step": 1327
    },
    {
      "epoch": 20.75,
      "grad_norm": 2.350956439971924,
      "learning_rate": 0.0001170625,
      "loss": 1.7096,
      "step": 1328
    },
    {
      "epoch": 20.765625,
      "grad_norm": 2.6565890312194824,
      "learning_rate": 0.000117,
      "loss": 1.4917,
      "step": 1329
    },
    {
      "epoch": 20.78125,
      "grad_norm": 2.5709235668182373,
      "learning_rate": 0.00011693750000000002,
      "loss": 1.4547,
      "step": 1330
    },
    {
      "epoch": 20.796875,
      "grad_norm": 2.4267430305480957,
      "learning_rate": 0.000116875,
      "loss": 1.3948,
      "step": 1331
    },
    {
      "epoch": 20.8125,
      "grad_norm": 2.3794968128204346,
      "learning_rate": 0.00011681250000000001,
      "loss": 1.6571,
      "step": 1332
    },
    {
      "epoch": 20.828125,
      "grad_norm": 2.190777540206909,
      "learning_rate": 0.00011675,
      "loss": 1.5683,
      "step": 1333
    },
    {
      "epoch": 20.84375,
      "grad_norm": 2.1807074546813965,
      "learning_rate": 0.00011668750000000001,
      "loss": 1.4285,
      "step": 1334
    },
    {
      "epoch": 20.859375,
      "grad_norm": 2.296510934829712,
      "learning_rate": 0.000116625,
      "loss": 1.5538,
      "step": 1335
    },
    {
      "epoch": 20.875,
      "grad_norm": 2.212714433670044,
      "learning_rate": 0.0001165625,
      "loss": 1.5811,
      "step": 1336
    },
    {
      "epoch": 20.890625,
      "grad_norm": 2.3220527172088623,
      "learning_rate": 0.00011650000000000001,
      "loss": 1.7001,
      "step": 1337
    },
    {
      "epoch": 20.90625,
      "grad_norm": 2.5062737464904785,
      "learning_rate": 0.0001164375,
      "loss": 1.6813,
      "step": 1338
    },
    {
      "epoch": 20.921875,
      "grad_norm": 2.35050892829895,
      "learning_rate": 0.00011637500000000001,
      "loss": 1.6822,
      "step": 1339
    },
    {
      "epoch": 20.9375,
      "grad_norm": 2.350184679031372,
      "learning_rate": 0.0001163125,
      "loss": 1.5169,
      "step": 1340
    },
    {
      "epoch": 20.953125,
      "grad_norm": 2.511523962020874,
      "learning_rate": 0.00011625000000000002,
      "loss": 1.6098,
      "step": 1341
    },
    {
      "epoch": 20.96875,
      "grad_norm": 2.831927537918091,
      "learning_rate": 0.0001161875,
      "loss": 1.5188,
      "step": 1342
    },
    {
      "epoch": 20.984375,
      "grad_norm": 2.7875800132751465,
      "learning_rate": 0.00011612499999999999,
      "loss": 1.6345,
      "step": 1343
    },
    {
      "epoch": 21.0,
      "grad_norm": 2.389643430709839,
      "learning_rate": 0.00011606250000000001,
      "loss": 1.3357,
      "step": 1344
    },
    {
      "epoch": 21.0,
      "eval_loss": 3.010438919067383,
      "eval_runtime": 2.8798,
      "eval_samples_per_second": 177.791,
      "eval_steps_per_second": 44.448,
      "step": 1344
    },
    {
      "epoch": 21.015625,
      "grad_norm": 2.40962553024292,
      "learning_rate": 0.000116,
      "loss": 1.2471,
      "step": 1345
    },
    {
      "epoch": 21.03125,
      "grad_norm": 2.3335421085357666,
      "learning_rate": 0.0001159375,
      "loss": 1.6927,
      "step": 1346
    },
    {
      "epoch": 21.046875,
      "grad_norm": 2.03560471534729,
      "learning_rate": 0.000115875,
      "loss": 1.7252,
      "step": 1347
    },
    {
      "epoch": 21.0625,
      "grad_norm": 2.4395759105682373,
      "learning_rate": 0.00011581250000000002,
      "loss": 1.5281,
      "step": 1348
    },
    {
      "epoch": 21.078125,
      "grad_norm": 2.2898285388946533,
      "learning_rate": 0.00011575000000000001,
      "loss": 1.2645,
      "step": 1349
    },
    {
      "epoch": 21.09375,
      "grad_norm": 2.2703404426574707,
      "learning_rate": 0.00011568750000000001,
      "loss": 1.2532,
      "step": 1350
    },
    {
      "epoch": 21.109375,
      "grad_norm": 2.166947603225708,
      "learning_rate": 0.000115625,
      "loss": 1.4152,
      "step": 1351
    },
    {
      "epoch": 21.125,
      "grad_norm": 2.6031415462493896,
      "learning_rate": 0.0001155625,
      "loss": 1.4356,
      "step": 1352
    },
    {
      "epoch": 21.140625,
      "grad_norm": 2.6499781608581543,
      "learning_rate": 0.0001155,
      "loss": 1.5466,
      "step": 1353
    },
    {
      "epoch": 21.15625,
      "grad_norm": 2.096743583679199,
      "learning_rate": 0.0001154375,
      "loss": 1.4991,
      "step": 1354
    },
    {
      "epoch": 21.171875,
      "grad_norm": 2.04561710357666,
      "learning_rate": 0.00011537500000000001,
      "loss": 1.3485,
      "step": 1355
    },
    {
      "epoch": 21.1875,
      "grad_norm": 2.213837146759033,
      "learning_rate": 0.0001153125,
      "loss": 1.7117,
      "step": 1356
    },
    {
      "epoch": 21.203125,
      "grad_norm": 2.5691802501678467,
      "learning_rate": 0.00011525000000000001,
      "loss": 1.3635,
      "step": 1357
    },
    {
      "epoch": 21.21875,
      "grad_norm": 2.7057559490203857,
      "learning_rate": 0.0001151875,
      "loss": 1.37,
      "step": 1358
    },
    {
      "epoch": 21.234375,
      "grad_norm": 2.239302635192871,
      "learning_rate": 0.00011512500000000002,
      "loss": 1.6959,
      "step": 1359
    },
    {
      "epoch": 21.25,
      "grad_norm": 2.1280131340026855,
      "learning_rate": 0.00011506250000000001,
      "loss": 1.2848,
      "step": 1360
    },
    {
      "epoch": 21.265625,
      "grad_norm": 2.3501014709472656,
      "learning_rate": 0.00011499999999999999,
      "loss": 1.4924,
      "step": 1361
    },
    {
      "epoch": 21.28125,
      "grad_norm": 2.3358657360076904,
      "learning_rate": 0.00011493750000000001,
      "loss": 1.3782,
      "step": 1362
    },
    {
      "epoch": 21.296875,
      "grad_norm": 2.3177268505096436,
      "learning_rate": 0.000114875,
      "loss": 1.677,
      "step": 1363
    },
    {
      "epoch": 21.3125,
      "grad_norm": 2.3877522945404053,
      "learning_rate": 0.0001148125,
      "loss": 1.4882,
      "step": 1364
    },
    {
      "epoch": 21.328125,
      "grad_norm": 2.597093105316162,
      "learning_rate": 0.00011475,
      "loss": 1.6681,
      "step": 1365
    },
    {
      "epoch": 21.34375,
      "grad_norm": 2.485494375228882,
      "learning_rate": 0.00011468750000000002,
      "loss": 1.6403,
      "step": 1366
    },
    {
      "epoch": 21.359375,
      "grad_norm": 2.606267213821411,
      "learning_rate": 0.00011462500000000001,
      "loss": 1.3358,
      "step": 1367
    },
    {
      "epoch": 21.375,
      "grad_norm": 2.3991613388061523,
      "learning_rate": 0.0001145625,
      "loss": 1.3786,
      "step": 1368
    },
    {
      "epoch": 21.390625,
      "grad_norm": 2.080869197845459,
      "learning_rate": 0.0001145,
      "loss": 1.4782,
      "step": 1369
    },
    {
      "epoch": 21.40625,
      "grad_norm": 2.5028555393218994,
      "learning_rate": 0.0001144375,
      "loss": 1.3685,
      "step": 1370
    },
    {
      "epoch": 21.421875,
      "grad_norm": 2.4554085731506348,
      "learning_rate": 0.00011437500000000002,
      "loss": 1.5113,
      "step": 1371
    },
    {
      "epoch": 21.4375,
      "grad_norm": 2.138110399246216,
      "learning_rate": 0.0001143125,
      "loss": 1.4787,
      "step": 1372
    },
    {
      "epoch": 21.453125,
      "grad_norm": 2.311528205871582,
      "learning_rate": 0.00011425000000000001,
      "loss": 1.3565,
      "step": 1373
    },
    {
      "epoch": 21.46875,
      "grad_norm": 2.5464165210723877,
      "learning_rate": 0.0001141875,
      "loss": 1.3663,
      "step": 1374
    },
    {
      "epoch": 21.484375,
      "grad_norm": 2.272981882095337,
      "learning_rate": 0.00011412500000000001,
      "loss": 1.3672,
      "step": 1375
    },
    {
      "epoch": 21.5,
      "grad_norm": 2.5827746391296387,
      "learning_rate": 0.0001140625,
      "loss": 1.6179,
      "step": 1376
    },
    {
      "epoch": 21.515625,
      "grad_norm": 2.3968260288238525,
      "learning_rate": 0.00011399999999999999,
      "loss": 1.5714,
      "step": 1377
    },
    {
      "epoch": 21.53125,
      "grad_norm": 2.294435501098633,
      "learning_rate": 0.00011393750000000001,
      "loss": 1.4812,
      "step": 1378
    },
    {
      "epoch": 21.546875,
      "grad_norm": 2.4044032096862793,
      "learning_rate": 0.000113875,
      "loss": 1.2301,
      "step": 1379
    },
    {
      "epoch": 21.5625,
      "grad_norm": 2.298856496810913,
      "learning_rate": 0.00011381250000000001,
      "loss": 1.5178,
      "step": 1380
    },
    {
      "epoch": 21.578125,
      "grad_norm": 2.4408442974090576,
      "learning_rate": 0.00011375,
      "loss": 1.6182,
      "step": 1381
    },
    {
      "epoch": 21.59375,
      "grad_norm": 3.0359296798706055,
      "learning_rate": 0.00011368750000000002,
      "loss": 1.2993,
      "step": 1382
    },
    {
      "epoch": 21.609375,
      "grad_norm": 2.3581390380859375,
      "learning_rate": 0.000113625,
      "loss": 1.6488,
      "step": 1383
    },
    {
      "epoch": 21.625,
      "grad_norm": 2.4010910987854004,
      "learning_rate": 0.00011356250000000002,
      "loss": 1.5975,
      "step": 1384
    },
    {
      "epoch": 21.640625,
      "grad_norm": 2.5211117267608643,
      "learning_rate": 0.00011350000000000001,
      "loss": 1.4236,
      "step": 1385
    },
    {
      "epoch": 21.65625,
      "grad_norm": 2.5127625465393066,
      "learning_rate": 0.0001134375,
      "loss": 1.4421,
      "step": 1386
    },
    {
      "epoch": 21.671875,
      "grad_norm": 2.616828441619873,
      "learning_rate": 0.000113375,
      "loss": 1.6045,
      "step": 1387
    },
    {
      "epoch": 21.6875,
      "grad_norm": 2.378469944000244,
      "learning_rate": 0.0001133125,
      "loss": 1.3875,
      "step": 1388
    },
    {
      "epoch": 21.703125,
      "grad_norm": 2.321558713912964,
      "learning_rate": 0.00011325000000000002,
      "loss": 1.6051,
      "step": 1389
    },
    {
      "epoch": 21.71875,
      "grad_norm": 2.5773818492889404,
      "learning_rate": 0.00011318750000000001,
      "loss": 1.3223,
      "step": 1390
    },
    {
      "epoch": 21.734375,
      "grad_norm": 1.9409066438674927,
      "learning_rate": 0.00011312500000000001,
      "loss": 1.1781,
      "step": 1391
    },
    {
      "epoch": 21.75,
      "grad_norm": 2.630927085876465,
      "learning_rate": 0.0001130625,
      "loss": 1.4815,
      "step": 1392
    },
    {
      "epoch": 21.765625,
      "grad_norm": 2.530322551727295,
      "learning_rate": 0.000113,
      "loss": 1.5677,
      "step": 1393
    },
    {
      "epoch": 21.78125,
      "grad_norm": 3.0794968605041504,
      "learning_rate": 0.0001129375,
      "loss": 1.4104,
      "step": 1394
    },
    {
      "epoch": 21.796875,
      "grad_norm": 2.421962261199951,
      "learning_rate": 0.00011287499999999999,
      "loss": 1.5158,
      "step": 1395
    },
    {
      "epoch": 21.8125,
      "grad_norm": 2.1461753845214844,
      "learning_rate": 0.00011281250000000001,
      "loss": 1.3696,
      "step": 1396
    },
    {
      "epoch": 21.828125,
      "grad_norm": 2.223426342010498,
      "learning_rate": 0.00011275,
      "loss": 1.495,
      "step": 1397
    },
    {
      "epoch": 21.84375,
      "grad_norm": 2.2779369354248047,
      "learning_rate": 0.00011268750000000001,
      "loss": 1.3557,
      "step": 1398
    },
    {
      "epoch": 21.859375,
      "grad_norm": 2.330634593963623,
      "learning_rate": 0.000112625,
      "loss": 1.3723,
      "step": 1399
    },
    {
      "epoch": 21.875,
      "grad_norm": 2.556164503097534,
      "learning_rate": 0.00011256250000000002,
      "loss": 1.3946,
      "step": 1400
    },
    {
      "epoch": 21.890625,
      "grad_norm": 2.3458104133605957,
      "learning_rate": 0.00011250000000000001,
      "loss": 1.4937,
      "step": 1401
    },
    {
      "epoch": 21.90625,
      "grad_norm": 2.039318323135376,
      "learning_rate": 0.0001124375,
      "loss": 1.4104,
      "step": 1402
    },
    {
      "epoch": 21.921875,
      "grad_norm": 2.299659013748169,
      "learning_rate": 0.00011237500000000001,
      "loss": 1.6486,
      "step": 1403
    },
    {
      "epoch": 21.9375,
      "grad_norm": 2.6525559425354004,
      "learning_rate": 0.0001123125,
      "loss": 1.6799,
      "step": 1404
    },
    {
      "epoch": 21.953125,
      "grad_norm": 2.457286834716797,
      "learning_rate": 0.00011225,
      "loss": 1.3582,
      "step": 1405
    },
    {
      "epoch": 21.96875,
      "grad_norm": 2.609015703201294,
      "learning_rate": 0.0001121875,
      "loss": 1.6185,
      "step": 1406
    },
    {
      "epoch": 21.984375,
      "grad_norm": 2.6779980659484863,
      "learning_rate": 0.00011212500000000001,
      "loss": 1.5721,
      "step": 1407
    },
    {
      "epoch": 22.0,
      "grad_norm": 3.0691254138946533,
      "learning_rate": 0.0001120625,
      "loss": 1.3793,
      "step": 1408
    },
    {
      "epoch": 22.0,
      "eval_loss": 3.020935535430908,
      "eval_runtime": 2.9866,
      "eval_samples_per_second": 171.432,
      "eval_steps_per_second": 42.858,
      "step": 1408
    },
    {
      "epoch": 22.015625,
      "grad_norm": 2.4270689487457275,
      "learning_rate": 0.00011200000000000001,
      "loss": 1.2921,
      "step": 1409
    },
    {
      "epoch": 22.03125,
      "grad_norm": 2.7592403888702393,
      "learning_rate": 0.0001119375,
      "loss": 1.3969,
      "step": 1410
    },
    {
      "epoch": 22.046875,
      "grad_norm": 2.089477300643921,
      "learning_rate": 0.000111875,
      "loss": 1.313,
      "step": 1411
    },
    {
      "epoch": 22.0625,
      "grad_norm": 2.2643938064575195,
      "learning_rate": 0.00011181250000000001,
      "loss": 1.5911,
      "step": 1412
    },
    {
      "epoch": 22.078125,
      "grad_norm": 2.535189390182495,
      "learning_rate": 0.00011175,
      "loss": 1.3632,
      "step": 1413
    },
    {
      "epoch": 22.09375,
      "grad_norm": 2.1426033973693848,
      "learning_rate": 0.00011168750000000001,
      "loss": 1.6337,
      "step": 1414
    },
    {
      "epoch": 22.109375,
      "grad_norm": 2.0354866981506348,
      "learning_rate": 0.000111625,
      "loss": 1.708,
      "step": 1415
    },
    {
      "epoch": 22.125,
      "grad_norm": 2.3696401119232178,
      "learning_rate": 0.00011156250000000001,
      "loss": 1.1503,
      "step": 1416
    },
    {
      "epoch": 22.140625,
      "grad_norm": 2.3983302116394043,
      "learning_rate": 0.0001115,
      "loss": 1.6358,
      "step": 1417
    },
    {
      "epoch": 22.15625,
      "grad_norm": 2.474519729614258,
      "learning_rate": 0.00011143749999999999,
      "loss": 1.5858,
      "step": 1418
    },
    {
      "epoch": 22.171875,
      "grad_norm": 2.480902671813965,
      "learning_rate": 0.00011137500000000001,
      "loss": 1.3969,
      "step": 1419
    },
    {
      "epoch": 22.1875,
      "grad_norm": 2.34348464012146,
      "learning_rate": 0.0001113125,
      "loss": 1.5097,
      "step": 1420
    },
    {
      "epoch": 22.203125,
      "grad_norm": 2.299539804458618,
      "learning_rate": 0.00011125000000000001,
      "loss": 1.4817,
      "step": 1421
    },
    {
      "epoch": 22.21875,
      "grad_norm": 2.5098063945770264,
      "learning_rate": 0.0001111875,
      "loss": 1.5003,
      "step": 1422
    },
    {
      "epoch": 22.234375,
      "grad_norm": 2.1557021141052246,
      "learning_rate": 0.00011112500000000002,
      "loss": 1.5758,
      "step": 1423
    },
    {
      "epoch": 22.25,
      "grad_norm": 2.3690006732940674,
      "learning_rate": 0.00011106250000000001,
      "loss": 1.0559,
      "step": 1424
    },
    {
      "epoch": 22.265625,
      "grad_norm": 2.1825928688049316,
      "learning_rate": 0.00011100000000000001,
      "loss": 1.5953,
      "step": 1425
    },
    {
      "epoch": 22.28125,
      "grad_norm": 2.557042360305786,
      "learning_rate": 0.0001109375,
      "loss": 1.6446,
      "step": 1426
    },
    {
      "epoch": 22.296875,
      "grad_norm": 2.0855934619903564,
      "learning_rate": 0.000110875,
      "loss": 1.5778,
      "step": 1427
    },
    {
      "epoch": 22.3125,
      "grad_norm": 2.4548470973968506,
      "learning_rate": 0.0001108125,
      "loss": 1.3267,
      "step": 1428
    },
    {
      "epoch": 22.328125,
      "grad_norm": 2.4544808864593506,
      "learning_rate": 0.00011075,
      "loss": 1.2833,
      "step": 1429
    },
    {
      "epoch": 22.34375,
      "grad_norm": 2.3995847702026367,
      "learning_rate": 0.00011068750000000001,
      "loss": 1.4512,
      "step": 1430
    },
    {
      "epoch": 22.359375,
      "grad_norm": 2.7319600582122803,
      "learning_rate": 0.000110625,
      "loss": 1.4402,
      "step": 1431
    },
    {
      "epoch": 22.375,
      "grad_norm": 2.533761501312256,
      "learning_rate": 0.00011056250000000001,
      "loss": 1.4663,
      "step": 1432
    },
    {
      "epoch": 22.390625,
      "grad_norm": 2.2049221992492676,
      "learning_rate": 0.0001105,
      "loss": 1.2782,
      "step": 1433
    },
    {
      "epoch": 22.40625,
      "grad_norm": 2.3315317630767822,
      "learning_rate": 0.00011043750000000002,
      "loss": 1.5105,
      "step": 1434
    },
    {
      "epoch": 22.421875,
      "grad_norm": 2.7430198192596436,
      "learning_rate": 0.000110375,
      "loss": 1.3883,
      "step": 1435
    },
    {
      "epoch": 22.4375,
      "grad_norm": 2.433544635772705,
      "learning_rate": 0.00011031249999999999,
      "loss": 1.2823,
      "step": 1436
    },
    {
      "epoch": 22.453125,
      "grad_norm": 2.461618661880493,
      "learning_rate": 0.00011025000000000001,
      "loss": 1.5558,
      "step": 1437
    },
    {
      "epoch": 22.46875,
      "grad_norm": 2.113083600997925,
      "learning_rate": 0.0001101875,
      "loss": 1.4654,
      "step": 1438
    },
    {
      "epoch": 22.484375,
      "grad_norm": 2.622192859649658,
      "learning_rate": 0.00011012500000000001,
      "loss": 1.3662,
      "step": 1439
    },
    {
      "epoch": 22.5,
      "grad_norm": 2.7236621379852295,
      "learning_rate": 0.0001100625,
      "loss": 1.5046,
      "step": 1440
    },
    {
      "epoch": 22.515625,
      "grad_norm": 2.4539361000061035,
      "learning_rate": 0.00011000000000000002,
      "loss": 1.497,
      "step": 1441
    },
    {
      "epoch": 22.53125,
      "grad_norm": 2.742013692855835,
      "learning_rate": 0.00010993750000000001,
      "loss": 1.2805,
      "step": 1442
    },
    {
      "epoch": 22.546875,
      "grad_norm": 2.345588445663452,
      "learning_rate": 0.000109875,
      "loss": 1.2359,
      "step": 1443
    },
    {
      "epoch": 22.5625,
      "grad_norm": 2.1171133518218994,
      "learning_rate": 0.0001098125,
      "loss": 1.2781,
      "step": 1444
    },
    {
      "epoch": 22.578125,
      "grad_norm": 2.4570674896240234,
      "learning_rate": 0.00010975,
      "loss": 1.425,
      "step": 1445
    },
    {
      "epoch": 22.59375,
      "grad_norm": 2.5425102710723877,
      "learning_rate": 0.0001096875,
      "loss": 1.3853,
      "step": 1446
    },
    {
      "epoch": 22.609375,
      "grad_norm": 2.3383469581604004,
      "learning_rate": 0.000109625,
      "loss": 1.4593,
      "step": 1447
    },
    {
      "epoch": 22.625,
      "grad_norm": 2.5797126293182373,
      "learning_rate": 0.00010956250000000001,
      "loss": 1.2233,
      "step": 1448
    },
    {
      "epoch": 22.640625,
      "grad_norm": 2.4372949600219727,
      "learning_rate": 0.0001095,
      "loss": 1.562,
      "step": 1449
    },
    {
      "epoch": 22.65625,
      "grad_norm": 2.467707633972168,
      "learning_rate": 0.00010943750000000001,
      "loss": 1.4631,
      "step": 1450
    },
    {
      "epoch": 22.671875,
      "grad_norm": 2.391948938369751,
      "learning_rate": 0.000109375,
      "loss": 1.4668,
      "step": 1451
    },
    {
      "epoch": 22.6875,
      "grad_norm": 1.889631986618042,
      "learning_rate": 0.0001093125,
      "loss": 1.0872,
      "step": 1452
    },
    {
      "epoch": 22.703125,
      "grad_norm": 2.208453893661499,
      "learning_rate": 0.00010925000000000001,
      "loss": 1.4589,
      "step": 1453
    },
    {
      "epoch": 22.71875,
      "grad_norm": 2.406266450881958,
      "learning_rate": 0.0001091875,
      "loss": 1.4795,
      "step": 1454
    },
    {
      "epoch": 22.734375,
      "grad_norm": 2.579381227493286,
      "learning_rate": 0.00010912500000000001,
      "loss": 1.4677,
      "step": 1455
    },
    {
      "epoch": 22.75,
      "grad_norm": 2.513169050216675,
      "learning_rate": 0.0001090625,
      "loss": 1.6311,
      "step": 1456
    },
    {
      "epoch": 22.765625,
      "grad_norm": 2.7445638179779053,
      "learning_rate": 0.000109,
      "loss": 1.5311,
      "step": 1457
    },
    {
      "epoch": 22.78125,
      "grad_norm": 2.548327922821045,
      "learning_rate": 0.0001089375,
      "loss": 1.567,
      "step": 1458
    },
    {
      "epoch": 22.796875,
      "grad_norm": 2.51176118850708,
      "learning_rate": 0.00010887500000000002,
      "loss": 1.6384,
      "step": 1459
    },
    {
      "epoch": 22.8125,
      "grad_norm": 2.434957265853882,
      "learning_rate": 0.00010881250000000001,
      "loss": 1.5479,
      "step": 1460
    },
    {
      "epoch": 22.828125,
      "grad_norm": 2.5649609565734863,
      "learning_rate": 0.00010875,
      "loss": 1.3684,
      "step": 1461
    },
    {
      "epoch": 22.84375,
      "grad_norm": 2.2807960510253906,
      "learning_rate": 0.0001086875,
      "loss": 1.5161,
      "step": 1462
    },
    {
      "epoch": 22.859375,
      "grad_norm": 2.614351987838745,
      "learning_rate": 0.000108625,
      "loss": 1.3748,
      "step": 1463
    },
    {
      "epoch": 22.875,
      "grad_norm": 2.757826805114746,
      "learning_rate": 0.00010856250000000002,
      "loss": 1.5463,
      "step": 1464
    },
    {
      "epoch": 22.890625,
      "grad_norm": 2.3243327140808105,
      "learning_rate": 0.00010850000000000001,
      "loss": 1.5162,
      "step": 1465
    },
    {
      "epoch": 22.90625,
      "grad_norm": 2.309896945953369,
      "learning_rate": 0.00010843750000000001,
      "loss": 1.5111,
      "step": 1466
    },
    {
      "epoch": 22.921875,
      "grad_norm": 2.1772520542144775,
      "learning_rate": 0.000108375,
      "loss": 1.5905,
      "step": 1467
    },
    {
      "epoch": 22.9375,
      "grad_norm": 2.2548184394836426,
      "learning_rate": 0.0001083125,
      "loss": 1.5291,
      "step": 1468
    },
    {
      "epoch": 22.953125,
      "grad_norm": 2.5237202644348145,
      "learning_rate": 0.00010825,
      "loss": 1.3487,
      "step": 1469
    },
    {
      "epoch": 22.96875,
      "grad_norm": 2.463179111480713,
      "learning_rate": 0.0001081875,
      "loss": 1.4331,
      "step": 1470
    },
    {
      "epoch": 22.984375,
      "grad_norm": 2.3376917839050293,
      "learning_rate": 0.00010812500000000001,
      "loss": 1.4405,
      "step": 1471
    },
    {
      "epoch": 23.0,
      "grad_norm": 2.6721930503845215,
      "learning_rate": 0.0001080625,
      "loss": 1.7024,
      "step": 1472
    },
    {
      "epoch": 23.0,
      "eval_loss": 3.0247654914855957,
      "eval_runtime": 2.8488,
      "eval_samples_per_second": 179.724,
      "eval_steps_per_second": 44.931,
      "step": 1472
    },
    {
      "epoch": 23.015625,
      "grad_norm": 2.33122181892395,
      "learning_rate": 0.00010800000000000001,
      "loss": 1.2911,
      "step": 1473
    },
    {
      "epoch": 23.03125,
      "grad_norm": 2.276770830154419,
      "learning_rate": 0.0001079375,
      "loss": 1.4288,
      "step": 1474
    },
    {
      "epoch": 23.046875,
      "grad_norm": 2.5172979831695557,
      "learning_rate": 0.00010787500000000002,
      "loss": 1.6559,
      "step": 1475
    },
    {
      "epoch": 23.0625,
      "grad_norm": 2.3604180812835693,
      "learning_rate": 0.00010781250000000001,
      "loss": 1.4827,
      "step": 1476
    },
    {
      "epoch": 23.078125,
      "grad_norm": 2.57254695892334,
      "learning_rate": 0.00010774999999999999,
      "loss": 1.3113,
      "step": 1477
    },
    {
      "epoch": 23.09375,
      "grad_norm": 2.1882190704345703,
      "learning_rate": 0.00010768750000000001,
      "loss": 1.4682,
      "step": 1478
    },
    {
      "epoch": 23.109375,
      "grad_norm": 2.0833346843719482,
      "learning_rate": 0.000107625,
      "loss": 1.4336,
      "step": 1479
    },
    {
      "epoch": 23.125,
      "grad_norm": 2.149456739425659,
      "learning_rate": 0.0001075625,
      "loss": 1.4949,
      "step": 1480
    },
    {
      "epoch": 23.140625,
      "grad_norm": 2.12990140914917,
      "learning_rate": 0.0001075,
      "loss": 1.3968,
      "step": 1481
    },
    {
      "epoch": 23.15625,
      "grad_norm": 2.5998237133026123,
      "learning_rate": 0.00010743750000000002,
      "loss": 1.5295,
      "step": 1482
    },
    {
      "epoch": 23.171875,
      "grad_norm": 2.1948678493499756,
      "learning_rate": 0.00010737500000000001,
      "loss": 1.4358,
      "step": 1483
    },
    {
      "epoch": 23.1875,
      "grad_norm": 2.391529083251953,
      "learning_rate": 0.00010731250000000001,
      "loss": 1.3242,
      "step": 1484
    },
    {
      "epoch": 23.203125,
      "grad_norm": 2.3413314819335938,
      "learning_rate": 0.00010725,
      "loss": 1.3147,
      "step": 1485
    },
    {
      "epoch": 23.21875,
      "grad_norm": 2.20231294631958,
      "learning_rate": 0.0001071875,
      "loss": 1.6342,
      "step": 1486
    },
    {
      "epoch": 23.234375,
      "grad_norm": 2.6662304401397705,
      "learning_rate": 0.00010712500000000002,
      "loss": 1.1709,
      "step": 1487
    },
    {
      "epoch": 23.25,
      "grad_norm": 2.3613038063049316,
      "learning_rate": 0.0001070625,
      "loss": 1.5806,
      "step": 1488
    },
    {
      "epoch": 23.265625,
      "grad_norm": 2.2253880500793457,
      "learning_rate": 0.00010700000000000001,
      "loss": 1.2709,
      "step": 1489
    },
    {
      "epoch": 23.28125,
      "grad_norm": 2.664738655090332,
      "learning_rate": 0.0001069375,
      "loss": 1.4015,
      "step": 1490
    },
    {
      "epoch": 23.296875,
      "grad_norm": 2.6128838062286377,
      "learning_rate": 0.00010687500000000001,
      "loss": 1.2536,
      "step": 1491
    },
    {
      "epoch": 23.3125,
      "grad_norm": 2.7923521995544434,
      "learning_rate": 0.0001068125,
      "loss": 1.3248,
      "step": 1492
    },
    {
      "epoch": 23.328125,
      "grad_norm": 2.3152220249176025,
      "learning_rate": 0.00010674999999999999,
      "loss": 1.5165,
      "step": 1493
    },
    {
      "epoch": 23.34375,
      "grad_norm": 2.612161874771118,
      "learning_rate": 0.00010668750000000001,
      "loss": 1.1143,
      "step": 1494
    },
    {
      "epoch": 23.359375,
      "grad_norm": 2.402169704437256,
      "learning_rate": 0.000106625,
      "loss": 1.3807,
      "step": 1495
    },
    {
      "epoch": 23.375,
      "grad_norm": 2.4477062225341797,
      "learning_rate": 0.00010656250000000001,
      "loss": 1.4611,
      "step": 1496
    },
    {
      "epoch": 23.390625,
      "grad_norm": 2.3203845024108887,
      "learning_rate": 0.0001065,
      "loss": 1.4124,
      "step": 1497
    },
    {
      "epoch": 23.40625,
      "grad_norm": 2.540614366531372,
      "learning_rate": 0.00010643750000000002,
      "loss": 1.4958,
      "step": 1498
    },
    {
      "epoch": 23.421875,
      "grad_norm": 2.4457993507385254,
      "learning_rate": 0.000106375,
      "loss": 1.6049,
      "step": 1499
    },
    {
      "epoch": 23.4375,
      "grad_norm": 2.3877360820770264,
      "learning_rate": 0.00010631250000000002,
      "loss": 1.6479,
      "step": 1500
    },
    {
      "epoch": 23.453125,
      "grad_norm": 2.201930046081543,
      "learning_rate": 0.00010625000000000001,
      "loss": 1.3606,
      "step": 1501
    },
    {
      "epoch": 23.46875,
      "grad_norm": 2.436615228652954,
      "learning_rate": 0.0001061875,
      "loss": 1.5271,
      "step": 1502
    },
    {
      "epoch": 23.484375,
      "grad_norm": 2.4544527530670166,
      "learning_rate": 0.000106125,
      "loss": 1.5907,
      "step": 1503
    },
    {
      "epoch": 23.5,
      "grad_norm": 2.3792061805725098,
      "learning_rate": 0.0001060625,
      "loss": 1.4527,
      "step": 1504
    },
    {
      "epoch": 23.515625,
      "grad_norm": 2.286012887954712,
      "learning_rate": 0.00010600000000000002,
      "loss": 1.3568,
      "step": 1505
    },
    {
      "epoch": 23.53125,
      "grad_norm": 2.4412119388580322,
      "learning_rate": 0.00010593750000000001,
      "loss": 1.3654,
      "step": 1506
    },
    {
      "epoch": 23.546875,
      "grad_norm": 2.3372554779052734,
      "learning_rate": 0.00010587500000000001,
      "loss": 1.2333,
      "step": 1507
    },
    {
      "epoch": 23.5625,
      "grad_norm": 2.600727081298828,
      "learning_rate": 0.0001058125,
      "loss": 1.3429,
      "step": 1508
    },
    {
      "epoch": 23.578125,
      "grad_norm": 2.386787176132202,
      "learning_rate": 0.00010575000000000001,
      "loss": 1.5104,
      "step": 1509
    },
    {
      "epoch": 23.59375,
      "grad_norm": 2.566511869430542,
      "learning_rate": 0.0001056875,
      "loss": 1.479,
      "step": 1510
    },
    {
      "epoch": 23.609375,
      "grad_norm": 2.4279472827911377,
      "learning_rate": 0.00010562499999999999,
      "loss": 1.2401,
      "step": 1511
    },
    {
      "epoch": 23.625,
      "grad_norm": 2.283860206604004,
      "learning_rate": 0.00010556250000000001,
      "loss": 1.4331,
      "step": 1512
    },
    {
      "epoch": 23.640625,
      "grad_norm": 2.350170612335205,
      "learning_rate": 0.0001055,
      "loss": 1.5343,
      "step": 1513
    },
    {
      "epoch": 23.65625,
      "grad_norm": 2.4446778297424316,
      "learning_rate": 0.00010543750000000001,
      "loss": 1.3065,
      "step": 1514
    },
    {
      "epoch": 23.671875,
      "grad_norm": 2.61873459815979,
      "learning_rate": 0.000105375,
      "loss": 1.075,
      "step": 1515
    },
    {
      "epoch": 23.6875,
      "grad_norm": 2.2802021503448486,
      "learning_rate": 0.00010531250000000002,
      "loss": 1.4188,
      "step": 1516
    },
    {
      "epoch": 23.703125,
      "grad_norm": 2.6854190826416016,
      "learning_rate": 0.00010525000000000001,
      "loss": 1.3052,
      "step": 1517
    },
    {
      "epoch": 23.71875,
      "grad_norm": 2.471858024597168,
      "learning_rate": 0.00010518749999999999,
      "loss": 1.5896,
      "step": 1518
    },
    {
      "epoch": 23.734375,
      "grad_norm": 2.161982774734497,
      "learning_rate": 0.00010512500000000001,
      "loss": 1.1752,
      "step": 1519
    },
    {
      "epoch": 23.75,
      "grad_norm": 2.26124906539917,
      "learning_rate": 0.0001050625,
      "loss": 1.5284,
      "step": 1520
    },
    {
      "epoch": 23.765625,
      "grad_norm": 2.5210213661193848,
      "learning_rate": 0.000105,
      "loss": 1.5618,
      "step": 1521
    },
    {
      "epoch": 23.78125,
      "grad_norm": 2.469492197036743,
      "learning_rate": 0.0001049375,
      "loss": 1.4351,
      "step": 1522
    },
    {
      "epoch": 23.796875,
      "grad_norm": 2.2915220260620117,
      "learning_rate": 0.00010487500000000001,
      "loss": 1.6984,
      "step": 1523
    },
    {
      "epoch": 23.8125,
      "grad_norm": 2.4975438117980957,
      "learning_rate": 0.0001048125,
      "loss": 1.512,
      "step": 1524
    },
    {
      "epoch": 23.828125,
      "grad_norm": 2.39056396484375,
      "learning_rate": 0.00010475000000000001,
      "loss": 1.6634,
      "step": 1525
    },
    {
      "epoch": 23.84375,
      "grad_norm": 2.4540224075317383,
      "learning_rate": 0.0001046875,
      "loss": 1.5442,
      "step": 1526
    },
    {
      "epoch": 23.859375,
      "grad_norm": 2.3060824871063232,
      "learning_rate": 0.000104625,
      "loss": 1.4242,
      "step": 1527
    },
    {
      "epoch": 23.875,
      "grad_norm": 2.7029736042022705,
      "learning_rate": 0.00010456250000000001,
      "loss": 1.6104,
      "step": 1528
    },
    {
      "epoch": 23.890625,
      "grad_norm": 2.454768657684326,
      "learning_rate": 0.00010449999999999999,
      "loss": 1.5425,
      "step": 1529
    },
    {
      "epoch": 23.90625,
      "grad_norm": 2.240652322769165,
      "learning_rate": 0.00010443750000000001,
      "loss": 1.6351,
      "step": 1530
    },
    {
      "epoch": 23.921875,
      "grad_norm": 2.6041505336761475,
      "learning_rate": 0.000104375,
      "loss": 1.4768,
      "step": 1531
    },
    {
      "epoch": 23.9375,
      "grad_norm": 2.354177951812744,
      "learning_rate": 0.00010431250000000001,
      "loss": 1.3477,
      "step": 1532
    },
    {
      "epoch": 23.953125,
      "grad_norm": 2.260481834411621,
      "learning_rate": 0.00010425,
      "loss": 1.4056,
      "step": 1533
    },
    {
      "epoch": 23.96875,
      "grad_norm": 2.5680227279663086,
      "learning_rate": 0.00010418750000000002,
      "loss": 1.515,
      "step": 1534
    },
    {
      "epoch": 23.984375,
      "grad_norm": 2.422372579574585,
      "learning_rate": 0.00010412500000000001,
      "loss": 1.4486,
      "step": 1535
    },
    {
      "epoch": 24.0,
      "grad_norm": 2.6342475414276123,
      "learning_rate": 0.0001040625,
      "loss": 1.3326,
      "step": 1536
    },
    {
      "epoch": 24.0,
      "eval_loss": 3.0262153148651123,
      "eval_runtime": 2.8799,
      "eval_samples_per_second": 177.786,
      "eval_steps_per_second": 44.446,
      "step": 1536
    },
    {
      "epoch": 24.015625,
      "grad_norm": 2.620285749435425,
      "learning_rate": 0.00010400000000000001,
      "loss": 1.3806,
      "step": 1537
    },
    {
      "epoch": 24.03125,
      "grad_norm": 2.298205852508545,
      "learning_rate": 0.0001039375,
      "loss": 1.3636,
      "step": 1538
    },
    {
      "epoch": 24.046875,
      "grad_norm": 2.4611055850982666,
      "learning_rate": 0.00010387500000000002,
      "loss": 1.4352,
      "step": 1539
    },
    {
      "epoch": 24.0625,
      "grad_norm": 2.3607304096221924,
      "learning_rate": 0.0001038125,
      "loss": 1.5033,
      "step": 1540
    },
    {
      "epoch": 24.078125,
      "grad_norm": 2.300271511077881,
      "learning_rate": 0.00010375000000000001,
      "loss": 1.3139,
      "step": 1541
    },
    {
      "epoch": 24.09375,
      "grad_norm": 2.568589210510254,
      "learning_rate": 0.0001036875,
      "loss": 1.2722,
      "step": 1542
    },
    {
      "epoch": 24.109375,
      "grad_norm": 2.269782543182373,
      "learning_rate": 0.000103625,
      "loss": 1.385,
      "step": 1543
    },
    {
      "epoch": 24.125,
      "grad_norm": 2.44081711769104,
      "learning_rate": 0.0001035625,
      "loss": 1.4732,
      "step": 1544
    },
    {
      "epoch": 24.140625,
      "grad_norm": 2.7607502937316895,
      "learning_rate": 0.0001035,
      "loss": 1.3331,
      "step": 1545
    },
    {
      "epoch": 24.15625,
      "grad_norm": 2.909696102142334,
      "learning_rate": 0.00010343750000000001,
      "loss": 1.5439,
      "step": 1546
    },
    {
      "epoch": 24.171875,
      "grad_norm": 2.3425774574279785,
      "learning_rate": 0.000103375,
      "loss": 1.3499,
      "step": 1547
    },
    {
      "epoch": 24.1875,
      "grad_norm": 2.4953808784484863,
      "learning_rate": 0.00010331250000000001,
      "loss": 1.3043,
      "step": 1548
    },
    {
      "epoch": 24.203125,
      "grad_norm": 2.2637057304382324,
      "learning_rate": 0.00010325,
      "loss": 1.5897,
      "step": 1549
    },
    {
      "epoch": 24.21875,
      "grad_norm": 2.3495664596557617,
      "learning_rate": 0.00010318750000000002,
      "loss": 1.3721,
      "step": 1550
    },
    {
      "epoch": 24.234375,
      "grad_norm": 2.614896774291992,
      "learning_rate": 0.000103125,
      "loss": 1.4567,
      "step": 1551
    },
    {
      "epoch": 24.25,
      "grad_norm": 2.447634696960449,
      "learning_rate": 0.00010306249999999999,
      "loss": 1.2838,
      "step": 1552
    },
    {
      "epoch": 24.265625,
      "grad_norm": 2.297496795654297,
      "learning_rate": 0.00010300000000000001,
      "loss": 1.2578,
      "step": 1553
    },
    {
      "epoch": 24.28125,
      "grad_norm": 2.416883707046509,
      "learning_rate": 0.0001029375,
      "loss": 1.4538,
      "step": 1554
    },
    {
      "epoch": 24.296875,
      "grad_norm": 2.253502368927002,
      "learning_rate": 0.00010287500000000001,
      "loss": 1.4762,
      "step": 1555
    },
    {
      "epoch": 24.3125,
      "grad_norm": 2.3531205654144287,
      "learning_rate": 0.0001028125,
      "loss": 1.3118,
      "step": 1556
    },
    {
      "epoch": 24.328125,
      "grad_norm": 2.239025831222534,
      "learning_rate": 0.00010275000000000002,
      "loss": 1.5017,
      "step": 1557
    },
    {
      "epoch": 24.34375,
      "grad_norm": 2.2663283348083496,
      "learning_rate": 0.00010268750000000001,
      "loss": 1.4201,
      "step": 1558
    },
    {
      "epoch": 24.359375,
      "grad_norm": 2.333200454711914,
      "learning_rate": 0.00010262500000000001,
      "loss": 1.3625,
      "step": 1559
    },
    {
      "epoch": 24.375,
      "grad_norm": 2.46991229057312,
      "learning_rate": 0.0001025625,
      "loss": 1.4776,
      "step": 1560
    },
    {
      "epoch": 24.390625,
      "grad_norm": 2.2986857891082764,
      "learning_rate": 0.0001025,
      "loss": 1.4984,
      "step": 1561
    },
    {
      "epoch": 24.40625,
      "grad_norm": 2.4187371730804443,
      "learning_rate": 0.0001024375,
      "loss": 1.5134,
      "step": 1562
    },
    {
      "epoch": 24.421875,
      "grad_norm": 2.2775089740753174,
      "learning_rate": 0.000102375,
      "loss": 1.4071,
      "step": 1563
    },
    {
      "epoch": 24.4375,
      "grad_norm": 2.218738079071045,
      "learning_rate": 0.00010231250000000001,
      "loss": 1.3866,
      "step": 1564
    },
    {
      "epoch": 24.453125,
      "grad_norm": 2.440150260925293,
      "learning_rate": 0.00010225,
      "loss": 1.3451,
      "step": 1565
    },
    {
      "epoch": 24.46875,
      "grad_norm": 2.328868865966797,
      "learning_rate": 0.00010218750000000001,
      "loss": 1.1961,
      "step": 1566
    },
    {
      "epoch": 24.484375,
      "grad_norm": 2.518519639968872,
      "learning_rate": 0.000102125,
      "loss": 1.2297,
      "step": 1567
    },
    {
      "epoch": 24.5,
      "grad_norm": 2.0802550315856934,
      "learning_rate": 0.0001020625,
      "loss": 1.5025,
      "step": 1568
    },
    {
      "epoch": 24.515625,
      "grad_norm": 2.559352397918701,
      "learning_rate": 0.00010200000000000001,
      "loss": 1.2965,
      "step": 1569
    },
    {
      "epoch": 24.53125,
      "grad_norm": 2.2396481037139893,
      "learning_rate": 0.00010193749999999999,
      "loss": 1.5778,
      "step": 1570
    },
    {
      "epoch": 24.546875,
      "grad_norm": 2.3944895267486572,
      "learning_rate": 0.00010187500000000001,
      "loss": 1.4821,
      "step": 1571
    },
    {
      "epoch": 24.5625,
      "grad_norm": 2.556366443634033,
      "learning_rate": 0.0001018125,
      "loss": 1.4721,
      "step": 1572
    },
    {
      "epoch": 24.578125,
      "grad_norm": 2.271042823791504,
      "learning_rate": 0.00010175,
      "loss": 1.4499,
      "step": 1573
    },
    {
      "epoch": 24.59375,
      "grad_norm": 2.2548389434814453,
      "learning_rate": 0.0001016875,
      "loss": 1.4069,
      "step": 1574
    },
    {
      "epoch": 24.609375,
      "grad_norm": 2.346895694732666,
      "learning_rate": 0.00010162500000000002,
      "loss": 1.261,
      "step": 1575
    },
    {
      "epoch": 24.625,
      "grad_norm": 2.1216092109680176,
      "learning_rate": 0.00010156250000000001,
      "loss": 1.156,
      "step": 1576
    },
    {
      "epoch": 24.640625,
      "grad_norm": 2.6333189010620117,
      "learning_rate": 0.0001015,
      "loss": 1.4463,
      "step": 1577
    },
    {
      "epoch": 24.65625,
      "grad_norm": 2.3750483989715576,
      "learning_rate": 0.0001014375,
      "loss": 1.5005,
      "step": 1578
    },
    {
      "epoch": 24.671875,
      "grad_norm": 2.2241575717926025,
      "learning_rate": 0.000101375,
      "loss": 1.5814,
      "step": 1579
    },
    {
      "epoch": 24.6875,
      "grad_norm": 2.2765324115753174,
      "learning_rate": 0.00010131250000000002,
      "loss": 1.6168,
      "step": 1580
    },
    {
      "epoch": 24.703125,
      "grad_norm": 2.4621565341949463,
      "learning_rate": 0.00010125,
      "loss": 1.3621,
      "step": 1581
    },
    {
      "epoch": 24.71875,
      "grad_norm": 2.2673821449279785,
      "learning_rate": 0.00010118750000000001,
      "loss": 1.353,
      "step": 1582
    },
    {
      "epoch": 24.734375,
      "grad_norm": 2.7065539360046387,
      "learning_rate": 0.000101125,
      "loss": 1.3252,
      "step": 1583
    },
    {
      "epoch": 24.75,
      "grad_norm": 2.3651232719421387,
      "learning_rate": 0.00010106250000000001,
      "loss": 1.4484,
      "step": 1584
    },
    {
      "epoch": 24.765625,
      "grad_norm": 2.298460006713867,
      "learning_rate": 0.000101,
      "loss": 1.2749,
      "step": 1585
    },
    {
      "epoch": 24.78125,
      "grad_norm": 2.3618204593658447,
      "learning_rate": 0.0001009375,
      "loss": 1.6861,
      "step": 1586
    },
    {
      "epoch": 24.796875,
      "grad_norm": 2.6302666664123535,
      "learning_rate": 0.00010087500000000001,
      "loss": 1.5011,
      "step": 1587
    },
    {
      "epoch": 24.8125,
      "grad_norm": 2.478790283203125,
      "learning_rate": 0.0001008125,
      "loss": 1.4044,
      "step": 1588
    },
    {
      "epoch": 24.828125,
      "grad_norm": 2.410696506500244,
      "learning_rate": 0.00010075000000000001,
      "loss": 1.5353,
      "step": 1589
    },
    {
      "epoch": 24.84375,
      "grad_norm": 2.4125871658325195,
      "learning_rate": 0.0001006875,
      "loss": 1.3098,
      "step": 1590
    },
    {
      "epoch": 24.859375,
      "grad_norm": 2.440197467803955,
      "learning_rate": 0.00010062500000000002,
      "loss": 1.3932,
      "step": 1591
    },
    {
      "epoch": 24.875,
      "grad_norm": 2.340721845626831,
      "learning_rate": 0.0001005625,
      "loss": 1.3632,
      "step": 1592
    },
    {
      "epoch": 24.890625,
      "grad_norm": 2.686894655227661,
      "learning_rate": 0.00010049999999999999,
      "loss": 1.4477,
      "step": 1593
    },
    {
      "epoch": 24.90625,
      "grad_norm": 2.33424711227417,
      "learning_rate": 0.00010043750000000001,
      "loss": 1.418,
      "step": 1594
    },
    {
      "epoch": 24.921875,
      "grad_norm": 2.5665502548217773,
      "learning_rate": 0.000100375,
      "loss": 1.5478,
      "step": 1595
    },
    {
      "epoch": 24.9375,
      "grad_norm": 2.4240915775299072,
      "learning_rate": 0.0001003125,
      "loss": 1.508,
      "step": 1596
    },
    {
      "epoch": 24.953125,
      "grad_norm": 2.3143234252929688,
      "learning_rate": 0.00010025,
      "loss": 1.4168,
      "step": 1597
    },
    {
      "epoch": 24.96875,
      "grad_norm": 2.291243314743042,
      "learning_rate": 0.00010018750000000002,
      "loss": 1.6063,
      "step": 1598
    },
    {
      "epoch": 24.984375,
      "grad_norm": 2.638087272644043,
      "learning_rate": 0.00010012500000000001,
      "loss": 1.3699,
      "step": 1599
    },
    {
      "epoch": 25.0,
      "grad_norm": 2.7789061069488525,
      "learning_rate": 0.00010006250000000001,
      "loss": 1.1591,
      "step": 1600
    },
    {
      "epoch": 25.0,
      "eval_loss": 3.0369787216186523,
      "eval_runtime": 2.9019,
      "eval_samples_per_second": 176.434,
      "eval_steps_per_second": 44.108,
      "step": 1600
    },
    {
      "epoch": 25.015625,
      "grad_norm": 2.402273416519165,
      "learning_rate": 0.0001,
      "loss": 1.4322,
      "step": 1601
    },
    {
      "epoch": 25.03125,
      "grad_norm": 2.6802804470062256,
      "learning_rate": 9.993750000000001e-05,
      "loss": 1.5029,
      "step": 1602
    },
    {
      "epoch": 25.046875,
      "grad_norm": 2.347919464111328,
      "learning_rate": 9.9875e-05,
      "loss": 1.5186,
      "step": 1603
    },
    {
      "epoch": 25.0625,
      "grad_norm": 2.695657968521118,
      "learning_rate": 9.981250000000001e-05,
      "loss": 1.4162,
      "step": 1604
    },
    {
      "epoch": 25.078125,
      "grad_norm": 2.241659641265869,
      "learning_rate": 9.975000000000001e-05,
      "loss": 1.3741,
      "step": 1605
    },
    {
      "epoch": 25.09375,
      "grad_norm": 2.1076574325561523,
      "learning_rate": 9.96875e-05,
      "loss": 1.6519,
      "step": 1606
    },
    {
      "epoch": 25.109375,
      "grad_norm": 2.3000693321228027,
      "learning_rate": 9.9625e-05,
      "loss": 1.3268,
      "step": 1607
    },
    {
      "epoch": 25.125,
      "grad_norm": 1.939793348312378,
      "learning_rate": 9.95625e-05,
      "loss": 1.065,
      "step": 1608
    },
    {
      "epoch": 25.140625,
      "grad_norm": 2.758286952972412,
      "learning_rate": 9.95e-05,
      "loss": 1.4735,
      "step": 1609
    },
    {
      "epoch": 25.15625,
      "grad_norm": 2.3117568492889404,
      "learning_rate": 9.943750000000001e-05,
      "loss": 1.1273,
      "step": 1610
    },
    {
      "epoch": 25.171875,
      "grad_norm": 2.6908509731292725,
      "learning_rate": 9.9375e-05,
      "loss": 1.497,
      "step": 1611
    },
    {
      "epoch": 25.1875,
      "grad_norm": 2.320357322692871,
      "learning_rate": 9.931250000000001e-05,
      "loss": 1.5747,
      "step": 1612
    },
    {
      "epoch": 25.203125,
      "grad_norm": 2.652792453765869,
      "learning_rate": 9.925000000000001e-05,
      "loss": 1.2483,
      "step": 1613
    },
    {
      "epoch": 25.21875,
      "grad_norm": 2.6372992992401123,
      "learning_rate": 9.91875e-05,
      "loss": 1.1717,
      "step": 1614
    },
    {
      "epoch": 25.234375,
      "grad_norm": 2.5990500450134277,
      "learning_rate": 9.9125e-05,
      "loss": 1.5472,
      "step": 1615
    },
    {
      "epoch": 25.25,
      "grad_norm": 2.468355894088745,
      "learning_rate": 9.90625e-05,
      "loss": 1.3757,
      "step": 1616
    },
    {
      "epoch": 25.265625,
      "grad_norm": 2.59305477142334,
      "learning_rate": 9.900000000000001e-05,
      "loss": 1.4551,
      "step": 1617
    },
    {
      "epoch": 25.28125,
      "grad_norm": 2.2119433879852295,
      "learning_rate": 9.89375e-05,
      "loss": 1.3077,
      "step": 1618
    },
    {
      "epoch": 25.296875,
      "grad_norm": 2.4328441619873047,
      "learning_rate": 9.8875e-05,
      "loss": 1.3635,
      "step": 1619
    },
    {
      "epoch": 25.3125,
      "grad_norm": 2.327752113342285,
      "learning_rate": 9.881250000000001e-05,
      "loss": 1.316,
      "step": 1620
    },
    {
      "epoch": 25.328125,
      "grad_norm": 2.624512195587158,
      "learning_rate": 9.875000000000002e-05,
      "loss": 1.2466,
      "step": 1621
    },
    {
      "epoch": 25.34375,
      "grad_norm": 2.594153642654419,
      "learning_rate": 9.868749999999999e-05,
      "loss": 1.4777,
      "step": 1622
    },
    {
      "epoch": 25.359375,
      "grad_norm": 2.3484115600585938,
      "learning_rate": 9.8625e-05,
      "loss": 1.1701,
      "step": 1623
    },
    {
      "epoch": 25.375,
      "grad_norm": 2.501244068145752,
      "learning_rate": 9.85625e-05,
      "loss": 1.3554,
      "step": 1624
    },
    {
      "epoch": 25.390625,
      "grad_norm": 2.6592233180999756,
      "learning_rate": 9.850000000000001e-05,
      "loss": 1.6317,
      "step": 1625
    },
    {
      "epoch": 25.40625,
      "grad_norm": 2.437893867492676,
      "learning_rate": 9.84375e-05,
      "loss": 1.3655,
      "step": 1626
    },
    {
      "epoch": 25.421875,
      "grad_norm": 2.1880643367767334,
      "learning_rate": 9.8375e-05,
      "loss": 1.2839,
      "step": 1627
    },
    {
      "epoch": 25.4375,
      "grad_norm": 2.311964750289917,
      "learning_rate": 9.831250000000001e-05,
      "loss": 1.5907,
      "step": 1628
    },
    {
      "epoch": 25.453125,
      "grad_norm": 2.6679880619049072,
      "learning_rate": 9.825e-05,
      "loss": 1.4115,
      "step": 1629
    },
    {
      "epoch": 25.46875,
      "grad_norm": 2.6628611087799072,
      "learning_rate": 9.818750000000001e-05,
      "loss": 1.1042,
      "step": 1630
    },
    {
      "epoch": 25.484375,
      "grad_norm": 2.273676633834839,
      "learning_rate": 9.8125e-05,
      "loss": 1.5083,
      "step": 1631
    },
    {
      "epoch": 25.5,
      "grad_norm": 2.3626363277435303,
      "learning_rate": 9.80625e-05,
      "loss": 1.3165,
      "step": 1632
    },
    {
      "epoch": 25.515625,
      "grad_norm": 2.3172342777252197,
      "learning_rate": 9.8e-05,
      "loss": 1.5861,
      "step": 1633
    },
    {
      "epoch": 25.53125,
      "grad_norm": 2.4574830532073975,
      "learning_rate": 9.79375e-05,
      "loss": 1.3041,
      "step": 1634
    },
    {
      "epoch": 25.546875,
      "grad_norm": 2.1185312271118164,
      "learning_rate": 9.787500000000001e-05,
      "loss": 1.4166,
      "step": 1635
    },
    {
      "epoch": 25.5625,
      "grad_norm": 2.4537434577941895,
      "learning_rate": 9.781250000000001e-05,
      "loss": 1.5112,
      "step": 1636
    },
    {
      "epoch": 25.578125,
      "grad_norm": 2.3975329399108887,
      "learning_rate": 9.775e-05,
      "loss": 1.5498,
      "step": 1637
    },
    {
      "epoch": 25.59375,
      "grad_norm": 2.3379600048065186,
      "learning_rate": 9.768750000000001e-05,
      "loss": 1.6534,
      "step": 1638
    },
    {
      "epoch": 25.609375,
      "grad_norm": 2.443769693374634,
      "learning_rate": 9.7625e-05,
      "loss": 1.2835,
      "step": 1639
    },
    {
      "epoch": 25.625,
      "grad_norm": 2.509476661682129,
      "learning_rate": 9.75625e-05,
      "loss": 1.4726,
      "step": 1640
    },
    {
      "epoch": 25.640625,
      "grad_norm": 2.532467842102051,
      "learning_rate": 9.75e-05,
      "loss": 1.458,
      "step": 1641
    },
    {
      "epoch": 25.65625,
      "grad_norm": 2.3362936973571777,
      "learning_rate": 9.74375e-05,
      "loss": 1.1949,
      "step": 1642
    },
    {
      "epoch": 25.671875,
      "grad_norm": 2.6403255462646484,
      "learning_rate": 9.737500000000001e-05,
      "loss": 1.2909,
      "step": 1643
    },
    {
      "epoch": 25.6875,
      "grad_norm": 2.501408338546753,
      "learning_rate": 9.73125e-05,
      "loss": 1.182,
      "step": 1644
    },
    {
      "epoch": 25.703125,
      "grad_norm": 2.247575521469116,
      "learning_rate": 9.725e-05,
      "loss": 1.3852,
      "step": 1645
    },
    {
      "epoch": 25.71875,
      "grad_norm": 2.5459249019622803,
      "learning_rate": 9.718750000000001e-05,
      "loss": 1.3237,
      "step": 1646
    },
    {
      "epoch": 25.734375,
      "grad_norm": 2.2984299659729004,
      "learning_rate": 9.7125e-05,
      "loss": 1.4546,
      "step": 1647
    },
    {
      "epoch": 25.75,
      "grad_norm": 2.4679179191589355,
      "learning_rate": 9.70625e-05,
      "loss": 1.0767,
      "step": 1648
    },
    {
      "epoch": 25.765625,
      "grad_norm": 2.728808641433716,
      "learning_rate": 9.7e-05,
      "loss": 1.5024,
      "step": 1649
    },
    {
      "epoch": 25.78125,
      "grad_norm": 2.4946911334991455,
      "learning_rate": 9.69375e-05,
      "loss": 1.4134,
      "step": 1650
    },
    {
      "epoch": 25.796875,
      "grad_norm": 2.377716302871704,
      "learning_rate": 9.687500000000001e-05,
      "loss": 1.5863,
      "step": 1651
    },
    {
      "epoch": 25.8125,
      "grad_norm": 2.382518768310547,
      "learning_rate": 9.68125e-05,
      "loss": 1.2179,
      "step": 1652
    },
    {
      "epoch": 25.828125,
      "grad_norm": 2.4088428020477295,
      "learning_rate": 9.675000000000001e-05,
      "loss": 1.4555,
      "step": 1653
    },
    {
      "epoch": 25.84375,
      "grad_norm": 2.4924731254577637,
      "learning_rate": 9.668750000000001e-05,
      "loss": 1.2649,
      "step": 1654
    },
    {
      "epoch": 25.859375,
      "grad_norm": 2.2659800052642822,
      "learning_rate": 9.6625e-05,
      "loss": 1.4055,
      "step": 1655
    },
    {
      "epoch": 25.875,
      "grad_norm": 2.4320192337036133,
      "learning_rate": 9.65625e-05,
      "loss": 1.5751,
      "step": 1656
    },
    {
      "epoch": 25.890625,
      "grad_norm": 2.563300371170044,
      "learning_rate": 9.65e-05,
      "loss": 1.2948,
      "step": 1657
    },
    {
      "epoch": 25.90625,
      "grad_norm": 2.127593994140625,
      "learning_rate": 9.64375e-05,
      "loss": 1.1497,
      "step": 1658
    },
    {
      "epoch": 25.921875,
      "grad_norm": 2.434506893157959,
      "learning_rate": 9.6375e-05,
      "loss": 1.2242,
      "step": 1659
    },
    {
      "epoch": 25.9375,
      "grad_norm": 2.408704996109009,
      "learning_rate": 9.63125e-05,
      "loss": 1.4203,
      "step": 1660
    },
    {
      "epoch": 25.953125,
      "grad_norm": 2.248551845550537,
      "learning_rate": 9.625000000000001e-05,
      "loss": 1.5729,
      "step": 1661
    },
    {
      "epoch": 25.96875,
      "grad_norm": 2.3623831272125244,
      "learning_rate": 9.618750000000001e-05,
      "loss": 1.6557,
      "step": 1662
    },
    {
      "epoch": 25.984375,
      "grad_norm": 2.1898014545440674,
      "learning_rate": 9.6125e-05,
      "loss": 1.6293,
      "step": 1663
    },
    {
      "epoch": 26.0,
      "grad_norm": 2.4342739582061768,
      "learning_rate": 9.60625e-05,
      "loss": 1.5795,
      "step": 1664
    },
    {
      "epoch": 26.0,
      "eval_loss": 3.037109375,
      "eval_runtime": 2.8877,
      "eval_samples_per_second": 177.301,
      "eval_steps_per_second": 44.325,
      "step": 1664
    },
    {
      "epoch": 26.015625,
      "grad_norm": 2.359473466873169,
      "learning_rate": 9.6e-05,
      "loss": 1.174,
      "step": 1665
    },
    {
      "epoch": 26.03125,
      "grad_norm": 2.2502474784851074,
      "learning_rate": 9.593750000000001e-05,
      "loss": 1.3392,
      "step": 1666
    },
    {
      "epoch": 26.046875,
      "grad_norm": 2.4700570106506348,
      "learning_rate": 9.5875e-05,
      "loss": 1.1903,
      "step": 1667
    },
    {
      "epoch": 26.0625,
      "grad_norm": 2.5304882526397705,
      "learning_rate": 9.58125e-05,
      "loss": 1.3401,
      "step": 1668
    },
    {
      "epoch": 26.078125,
      "grad_norm": 2.1821560859680176,
      "learning_rate": 9.575000000000001e-05,
      "loss": 1.2883,
      "step": 1669
    },
    {
      "epoch": 26.09375,
      "grad_norm": 2.472494602203369,
      "learning_rate": 9.56875e-05,
      "loss": 1.5621,
      "step": 1670
    },
    {
      "epoch": 26.109375,
      "grad_norm": 2.5188286304473877,
      "learning_rate": 9.562500000000001e-05,
      "loss": 1.4873,
      "step": 1671
    },
    {
      "epoch": 26.125,
      "grad_norm": 2.2774994373321533,
      "learning_rate": 9.55625e-05,
      "loss": 1.4862,
      "step": 1672
    },
    {
      "epoch": 26.140625,
      "grad_norm": 2.4163434505462646,
      "learning_rate": 9.55e-05,
      "loss": 1.4242,
      "step": 1673
    },
    {
      "epoch": 26.15625,
      "grad_norm": 2.4388198852539062,
      "learning_rate": 9.54375e-05,
      "loss": 1.6289,
      "step": 1674
    },
    {
      "epoch": 26.171875,
      "grad_norm": 2.5748300552368164,
      "learning_rate": 9.5375e-05,
      "loss": 1.448,
      "step": 1675
    },
    {
      "epoch": 26.1875,
      "grad_norm": 2.727675676345825,
      "learning_rate": 9.53125e-05,
      "loss": 1.2517,
      "step": 1676
    },
    {
      "epoch": 26.203125,
      "grad_norm": 2.3279924392700195,
      "learning_rate": 9.525000000000001e-05,
      "loss": 1.4263,
      "step": 1677
    },
    {
      "epoch": 26.21875,
      "grad_norm": 2.3999598026275635,
      "learning_rate": 9.51875e-05,
      "loss": 1.3406,
      "step": 1678
    },
    {
      "epoch": 26.234375,
      "grad_norm": 2.2347137928009033,
      "learning_rate": 9.512500000000001e-05,
      "loss": 1.3695,
      "step": 1679
    },
    {
      "epoch": 26.25,
      "grad_norm": 2.2676968574523926,
      "learning_rate": 9.506250000000001e-05,
      "loss": 1.2946,
      "step": 1680
    },
    {
      "epoch": 26.265625,
      "grad_norm": 2.3561336994171143,
      "learning_rate": 9.5e-05,
      "loss": 1.5422,
      "step": 1681
    },
    {
      "epoch": 26.28125,
      "grad_norm": 2.650923013687134,
      "learning_rate": 9.49375e-05,
      "loss": 1.5571,
      "step": 1682
    },
    {
      "epoch": 26.296875,
      "grad_norm": 2.0348846912384033,
      "learning_rate": 9.4875e-05,
      "loss": 1.461,
      "step": 1683
    },
    {
      "epoch": 26.3125,
      "grad_norm": 2.3180058002471924,
      "learning_rate": 9.481250000000001e-05,
      "loss": 1.3459,
      "step": 1684
    },
    {
      "epoch": 26.328125,
      "grad_norm": 2.3869950771331787,
      "learning_rate": 9.475e-05,
      "loss": 1.2724,
      "step": 1685
    },
    {
      "epoch": 26.34375,
      "grad_norm": 2.3643393516540527,
      "learning_rate": 9.46875e-05,
      "loss": 1.2619,
      "step": 1686
    },
    {
      "epoch": 26.359375,
      "grad_norm": 2.5261318683624268,
      "learning_rate": 9.462500000000001e-05,
      "loss": 1.1863,
      "step": 1687
    },
    {
      "epoch": 26.375,
      "grad_norm": 2.5459530353546143,
      "learning_rate": 9.456250000000001e-05,
      "loss": 0.9784,
      "step": 1688
    },
    {
      "epoch": 26.390625,
      "grad_norm": 2.421006917953491,
      "learning_rate": 9.449999999999999e-05,
      "loss": 1.3367,
      "step": 1689
    },
    {
      "epoch": 26.40625,
      "grad_norm": 2.5827419757843018,
      "learning_rate": 9.44375e-05,
      "loss": 1.3137,
      "step": 1690
    },
    {
      "epoch": 26.421875,
      "grad_norm": 2.249692916870117,
      "learning_rate": 9.4375e-05,
      "loss": 1.3008,
      "step": 1691
    },
    {
      "epoch": 26.4375,
      "grad_norm": 2.649374008178711,
      "learning_rate": 9.431250000000001e-05,
      "loss": 1.3185,
      "step": 1692
    },
    {
      "epoch": 26.453125,
      "grad_norm": 2.79433536529541,
      "learning_rate": 9.425e-05,
      "loss": 1.3777,
      "step": 1693
    },
    {
      "epoch": 26.46875,
      "grad_norm": 2.6525163650512695,
      "learning_rate": 9.41875e-05,
      "loss": 1.6373,
      "step": 1694
    },
    {
      "epoch": 26.484375,
      "grad_norm": 2.3282310962677,
      "learning_rate": 9.412500000000001e-05,
      "loss": 1.6138,
      "step": 1695
    },
    {
      "epoch": 26.5,
      "grad_norm": 2.3625435829162598,
      "learning_rate": 9.40625e-05,
      "loss": 1.4599,
      "step": 1696
    },
    {
      "epoch": 26.515625,
      "grad_norm": 2.6207196712493896,
      "learning_rate": 9.4e-05,
      "loss": 1.4551,
      "step": 1697
    },
    {
      "epoch": 26.53125,
      "grad_norm": 2.3300282955169678,
      "learning_rate": 9.39375e-05,
      "loss": 1.2391,
      "step": 1698
    },
    {
      "epoch": 26.546875,
      "grad_norm": 2.469866991043091,
      "learning_rate": 9.3875e-05,
      "loss": 1.1419,
      "step": 1699
    },
    {
      "epoch": 26.5625,
      "grad_norm": 2.3741703033447266,
      "learning_rate": 9.38125e-05,
      "loss": 1.5244,
      "step": 1700
    },
    {
      "epoch": 26.578125,
      "grad_norm": 2.3963866233825684,
      "learning_rate": 9.375e-05,
      "loss": 1.3899,
      "step": 1701
    },
    {
      "epoch": 26.59375,
      "grad_norm": 2.316650390625,
      "learning_rate": 9.368750000000001e-05,
      "loss": 1.3574,
      "step": 1702
    },
    {
      "epoch": 26.609375,
      "grad_norm": 2.4439876079559326,
      "learning_rate": 9.362500000000001e-05,
      "loss": 1.7119,
      "step": 1703
    },
    {
      "epoch": 26.625,
      "grad_norm": 2.3607192039489746,
      "learning_rate": 9.35625e-05,
      "loss": 1.421,
      "step": 1704
    },
    {
      "epoch": 26.640625,
      "grad_norm": 2.2711374759674072,
      "learning_rate": 9.350000000000001e-05,
      "loss": 1.3409,
      "step": 1705
    },
    {
      "epoch": 26.65625,
      "grad_norm": 2.4565532207489014,
      "learning_rate": 9.34375e-05,
      "loss": 1.4585,
      "step": 1706
    },
    {
      "epoch": 26.671875,
      "grad_norm": 2.258512020111084,
      "learning_rate": 9.3375e-05,
      "loss": 1.2507,
      "step": 1707
    },
    {
      "epoch": 26.6875,
      "grad_norm": 2.7186057567596436,
      "learning_rate": 9.33125e-05,
      "loss": 1.3651,
      "step": 1708
    },
    {
      "epoch": 26.703125,
      "grad_norm": 2.407775402069092,
      "learning_rate": 9.325e-05,
      "loss": 1.3347,
      "step": 1709
    },
    {
      "epoch": 26.71875,
      "grad_norm": 2.765726089477539,
      "learning_rate": 9.318750000000001e-05,
      "loss": 1.3936,
      "step": 1710
    },
    {
      "epoch": 26.734375,
      "grad_norm": 2.499204635620117,
      "learning_rate": 9.3125e-05,
      "loss": 1.533,
      "step": 1711
    },
    {
      "epoch": 26.75,
      "grad_norm": 2.663628578186035,
      "learning_rate": 9.30625e-05,
      "loss": 1.2934,
      "step": 1712
    },
    {
      "epoch": 26.765625,
      "grad_norm": 2.4822120666503906,
      "learning_rate": 9.300000000000001e-05,
      "loss": 1.4062,
      "step": 1713
    },
    {
      "epoch": 26.78125,
      "grad_norm": 2.393930673599243,
      "learning_rate": 9.29375e-05,
      "loss": 1.1824,
      "step": 1714
    },
    {
      "epoch": 26.796875,
      "grad_norm": 2.6587324142456055,
      "learning_rate": 9.2875e-05,
      "loss": 1.4549,
      "step": 1715
    },
    {
      "epoch": 26.8125,
      "grad_norm": 2.476799488067627,
      "learning_rate": 9.28125e-05,
      "loss": 1.2807,
      "step": 1716
    },
    {
      "epoch": 26.828125,
      "grad_norm": 2.494565010070801,
      "learning_rate": 9.275e-05,
      "loss": 1.6049,
      "step": 1717
    },
    {
      "epoch": 26.84375,
      "grad_norm": 2.2905490398406982,
      "learning_rate": 9.268750000000001e-05,
      "loss": 1.4489,
      "step": 1718
    },
    {
      "epoch": 26.859375,
      "grad_norm": 2.9204487800598145,
      "learning_rate": 9.2625e-05,
      "loss": 1.2724,
      "step": 1719
    },
    {
      "epoch": 26.875,
      "grad_norm": 2.6118133068084717,
      "learning_rate": 9.256250000000001e-05,
      "loss": 1.3596,
      "step": 1720
    },
    {
      "epoch": 26.890625,
      "grad_norm": 2.7343204021453857,
      "learning_rate": 9.250000000000001e-05,
      "loss": 1.4667,
      "step": 1721
    },
    {
      "epoch": 26.90625,
      "grad_norm": 2.75205135345459,
      "learning_rate": 9.24375e-05,
      "loss": 1.2396,
      "step": 1722
    },
    {
      "epoch": 26.921875,
      "grad_norm": 2.4443633556365967,
      "learning_rate": 9.2375e-05,
      "loss": 1.3522,
      "step": 1723
    },
    {
      "epoch": 26.9375,
      "grad_norm": 2.405514717102051,
      "learning_rate": 9.23125e-05,
      "loss": 1.3323,
      "step": 1724
    },
    {
      "epoch": 26.953125,
      "grad_norm": 2.793332576751709,
      "learning_rate": 9.225e-05,
      "loss": 1.4382,
      "step": 1725
    },
    {
      "epoch": 26.96875,
      "grad_norm": 2.470749616622925,
      "learning_rate": 9.21875e-05,
      "loss": 1.6164,
      "step": 1726
    },
    {
      "epoch": 26.984375,
      "grad_norm": 2.336679458618164,
      "learning_rate": 9.2125e-05,
      "loss": 1.6345,
      "step": 1727
    },
    {
      "epoch": 27.0,
      "grad_norm": 2.297586679458618,
      "learning_rate": 9.206250000000001e-05,
      "loss": 1.436,
      "step": 1728
    },
    {
      "epoch": 27.0,
      "eval_loss": 3.0515542030334473,
      "eval_runtime": 2.8579,
      "eval_samples_per_second": 179.151,
      "eval_steps_per_second": 44.788,
      "step": 1728
    },
    {
      "epoch": 27.015625,
      "grad_norm": 2.394995927810669,
      "learning_rate": 9.200000000000001e-05,
      "loss": 1.0737,
      "step": 1729
    },
    {
      "epoch": 27.03125,
      "grad_norm": 2.322646379470825,
      "learning_rate": 9.19375e-05,
      "loss": 1.5705,
      "step": 1730
    },
    {
      "epoch": 27.046875,
      "grad_norm": 2.601682424545288,
      "learning_rate": 9.1875e-05,
      "loss": 1.2631,
      "step": 1731
    },
    {
      "epoch": 27.0625,
      "grad_norm": 2.262084722518921,
      "learning_rate": 9.18125e-05,
      "loss": 1.3804,
      "step": 1732
    },
    {
      "epoch": 27.078125,
      "grad_norm": 2.4643077850341797,
      "learning_rate": 9.175000000000001e-05,
      "loss": 1.1702,
      "step": 1733
    },
    {
      "epoch": 27.09375,
      "grad_norm": 2.7816123962402344,
      "learning_rate": 9.16875e-05,
      "loss": 1.3708,
      "step": 1734
    },
    {
      "epoch": 27.109375,
      "grad_norm": 2.6347217559814453,
      "learning_rate": 9.1625e-05,
      "loss": 1.3125,
      "step": 1735
    },
    {
      "epoch": 27.125,
      "grad_norm": 2.2542450428009033,
      "learning_rate": 9.156250000000001e-05,
      "loss": 1.4287,
      "step": 1736
    },
    {
      "epoch": 27.140625,
      "grad_norm": 2.1367249488830566,
      "learning_rate": 9.15e-05,
      "loss": 1.347,
      "step": 1737
    },
    {
      "epoch": 27.15625,
      "grad_norm": 2.4237589836120605,
      "learning_rate": 9.14375e-05,
      "loss": 1.3072,
      "step": 1738
    },
    {
      "epoch": 27.171875,
      "grad_norm": 2.834909200668335,
      "learning_rate": 9.1375e-05,
      "loss": 1.3312,
      "step": 1739
    },
    {
      "epoch": 27.1875,
      "grad_norm": 2.970099449157715,
      "learning_rate": 9.13125e-05,
      "loss": 1.4088,
      "step": 1740
    },
    {
      "epoch": 27.203125,
      "grad_norm": 2.443269729614258,
      "learning_rate": 9.125e-05,
      "loss": 1.1643,
      "step": 1741
    },
    {
      "epoch": 27.21875,
      "grad_norm": 2.152560234069824,
      "learning_rate": 9.11875e-05,
      "loss": 1.4581,
      "step": 1742
    },
    {
      "epoch": 27.234375,
      "grad_norm": 2.294389247894287,
      "learning_rate": 9.1125e-05,
      "loss": 1.3088,
      "step": 1743
    },
    {
      "epoch": 27.25,
      "grad_norm": 2.39121413230896,
      "learning_rate": 9.106250000000001e-05,
      "loss": 1.5176,
      "step": 1744
    },
    {
      "epoch": 27.265625,
      "grad_norm": 2.5851144790649414,
      "learning_rate": 9.1e-05,
      "loss": 1.3417,
      "step": 1745
    },
    {
      "epoch": 27.28125,
      "grad_norm": 2.5576953887939453,
      "learning_rate": 9.093750000000001e-05,
      "loss": 1.34,
      "step": 1746
    },
    {
      "epoch": 27.296875,
      "grad_norm": 2.4649605751037598,
      "learning_rate": 9.0875e-05,
      "loss": 1.2727,
      "step": 1747
    },
    {
      "epoch": 27.3125,
      "grad_norm": 2.0990447998046875,
      "learning_rate": 9.08125e-05,
      "loss": 1.395,
      "step": 1748
    },
    {
      "epoch": 27.328125,
      "grad_norm": 2.2938783168792725,
      "learning_rate": 9.075e-05,
      "loss": 1.3392,
      "step": 1749
    },
    {
      "epoch": 27.34375,
      "grad_norm": 2.2091853618621826,
      "learning_rate": 9.06875e-05,
      "loss": 1.3404,
      "step": 1750
    },
    {
      "epoch": 27.359375,
      "grad_norm": 2.7596166133880615,
      "learning_rate": 9.062500000000001e-05,
      "loss": 1.3188,
      "step": 1751
    },
    {
      "epoch": 27.375,
      "grad_norm": 2.531481981277466,
      "learning_rate": 9.05625e-05,
      "loss": 1.3736,
      "step": 1752
    },
    {
      "epoch": 27.390625,
      "grad_norm": 2.1919689178466797,
      "learning_rate": 9.05e-05,
      "loss": 1.2958,
      "step": 1753
    },
    {
      "epoch": 27.40625,
      "grad_norm": 2.62312912940979,
      "learning_rate": 9.043750000000001e-05,
      "loss": 1.5257,
      "step": 1754
    },
    {
      "epoch": 27.421875,
      "grad_norm": 2.2387239933013916,
      "learning_rate": 9.037500000000001e-05,
      "loss": 1.4089,
      "step": 1755
    },
    {
      "epoch": 27.4375,
      "grad_norm": 2.2942259311676025,
      "learning_rate": 9.03125e-05,
      "loss": 1.5069,
      "step": 1756
    },
    {
      "epoch": 27.453125,
      "grad_norm": 2.4681785106658936,
      "learning_rate": 9.025e-05,
      "loss": 1.4006,
      "step": 1757
    },
    {
      "epoch": 27.46875,
      "grad_norm": 2.3471291065216064,
      "learning_rate": 9.01875e-05,
      "loss": 1.5172,
      "step": 1758
    },
    {
      "epoch": 27.484375,
      "grad_norm": 2.606654405593872,
      "learning_rate": 9.012500000000001e-05,
      "loss": 1.4537,
      "step": 1759
    },
    {
      "epoch": 27.5,
      "grad_norm": 2.631474733352661,
      "learning_rate": 9.00625e-05,
      "loss": 1.3709,
      "step": 1760
    },
    {
      "epoch": 27.515625,
      "grad_norm": 2.631814479827881,
      "learning_rate": 9e-05,
      "loss": 1.5111,
      "step": 1761
    },
    {
      "epoch": 27.53125,
      "grad_norm": 2.4277780055999756,
      "learning_rate": 8.993750000000001e-05,
      "loss": 1.1631,
      "step": 1762
    },
    {
      "epoch": 27.546875,
      "grad_norm": 2.595301866531372,
      "learning_rate": 8.9875e-05,
      "loss": 1.4544,
      "step": 1763
    },
    {
      "epoch": 27.5625,
      "grad_norm": 2.1347408294677734,
      "learning_rate": 8.98125e-05,
      "loss": 1.6255,
      "step": 1764
    },
    {
      "epoch": 27.578125,
      "grad_norm": 2.7754173278808594,
      "learning_rate": 8.975e-05,
      "loss": 1.2733,
      "step": 1765
    },
    {
      "epoch": 27.59375,
      "grad_norm": 2.566636085510254,
      "learning_rate": 8.96875e-05,
      "loss": 1.3959,
      "step": 1766
    },
    {
      "epoch": 27.609375,
      "grad_norm": 2.3145668506622314,
      "learning_rate": 8.962500000000001e-05,
      "loss": 1.5059,
      "step": 1767
    },
    {
      "epoch": 27.625,
      "grad_norm": 2.4023220539093018,
      "learning_rate": 8.95625e-05,
      "loss": 1.0971,
      "step": 1768
    },
    {
      "epoch": 27.640625,
      "grad_norm": 2.389479875564575,
      "learning_rate": 8.950000000000001e-05,
      "loss": 1.3144,
      "step": 1769
    },
    {
      "epoch": 27.65625,
      "grad_norm": 2.705702066421509,
      "learning_rate": 8.943750000000001e-05,
      "loss": 1.207,
      "step": 1770
    },
    {
      "epoch": 27.671875,
      "grad_norm": 2.815072536468506,
      "learning_rate": 8.9375e-05,
      "loss": 1.2191,
      "step": 1771
    },
    {
      "epoch": 27.6875,
      "grad_norm": 2.3281748294830322,
      "learning_rate": 8.93125e-05,
      "loss": 1.3877,
      "step": 1772
    },
    {
      "epoch": 27.703125,
      "grad_norm": 2.625584840774536,
      "learning_rate": 8.925e-05,
      "loss": 1.3946,
      "step": 1773
    },
    {
      "epoch": 27.71875,
      "grad_norm": 2.4452645778656006,
      "learning_rate": 8.91875e-05,
      "loss": 1.4279,
      "step": 1774
    },
    {
      "epoch": 27.734375,
      "grad_norm": 2.633371114730835,
      "learning_rate": 8.9125e-05,
      "loss": 1.2781,
      "step": 1775
    },
    {
      "epoch": 27.75,
      "grad_norm": 2.356947183609009,
      "learning_rate": 8.90625e-05,
      "loss": 1.4853,
      "step": 1776
    },
    {
      "epoch": 27.765625,
      "grad_norm": 2.5434412956237793,
      "learning_rate": 8.900000000000001e-05,
      "loss": 1.4063,
      "step": 1777
    },
    {
      "epoch": 27.78125,
      "grad_norm": 2.3137855529785156,
      "learning_rate": 8.89375e-05,
      "loss": 1.4077,
      "step": 1778
    },
    {
      "epoch": 27.796875,
      "grad_norm": 2.306757926940918,
      "learning_rate": 8.8875e-05,
      "loss": 1.3858,
      "step": 1779
    },
    {
      "epoch": 27.8125,
      "grad_norm": 2.1507675647735596,
      "learning_rate": 8.881250000000001e-05,
      "loss": 1.1612,
      "step": 1780
    },
    {
      "epoch": 27.828125,
      "grad_norm": 2.516491174697876,
      "learning_rate": 8.875e-05,
      "loss": 1.5623,
      "step": 1781
    },
    {
      "epoch": 27.84375,
      "grad_norm": 2.442410469055176,
      "learning_rate": 8.868750000000001e-05,
      "loss": 1.3042,
      "step": 1782
    },
    {
      "epoch": 27.859375,
      "grad_norm": 2.4291374683380127,
      "learning_rate": 8.8625e-05,
      "loss": 1.4468,
      "step": 1783
    },
    {
      "epoch": 27.875,
      "grad_norm": 2.2050421237945557,
      "learning_rate": 8.85625e-05,
      "loss": 1.1307,
      "step": 1784
    },
    {
      "epoch": 27.890625,
      "grad_norm": 2.4268059730529785,
      "learning_rate": 8.850000000000001e-05,
      "loss": 1.5025,
      "step": 1785
    },
    {
      "epoch": 27.90625,
      "grad_norm": 2.214414358139038,
      "learning_rate": 8.84375e-05,
      "loss": 1.3939,
      "step": 1786
    },
    {
      "epoch": 27.921875,
      "grad_norm": 2.365525960922241,
      "learning_rate": 8.837500000000001e-05,
      "loss": 1.4106,
      "step": 1787
    },
    {
      "epoch": 27.9375,
      "grad_norm": 2.4248297214508057,
      "learning_rate": 8.831250000000001e-05,
      "loss": 1.398,
      "step": 1788
    },
    {
      "epoch": 27.953125,
      "grad_norm": 2.3907768726348877,
      "learning_rate": 8.825e-05,
      "loss": 1.2658,
      "step": 1789
    },
    {
      "epoch": 27.96875,
      "grad_norm": 2.4537782669067383,
      "learning_rate": 8.81875e-05,
      "loss": 1.3966,
      "step": 1790
    },
    {
      "epoch": 27.984375,
      "grad_norm": 2.353975772857666,
      "learning_rate": 8.8125e-05,
      "loss": 1.3179,
      "step": 1791
    },
    {
      "epoch": 28.0,
      "grad_norm": 2.512988567352295,
      "learning_rate": 8.80625e-05,
      "loss": 1.4038,
      "step": 1792
    },
    {
      "epoch": 28.0,
      "eval_loss": 3.045253038406372,
      "eval_runtime": 2.9323,
      "eval_samples_per_second": 174.608,
      "eval_steps_per_second": 43.652,
      "step": 1792
    },
    {
      "epoch": 28.015625,
      "grad_norm": 2.180790662765503,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.5353,
      "step": 1793
    },
    {
      "epoch": 28.03125,
      "grad_norm": 2.504127264022827,
      "learning_rate": 8.79375e-05,
      "loss": 1.3198,
      "step": 1794
    },
    {
      "epoch": 28.046875,
      "grad_norm": 2.36154842376709,
      "learning_rate": 8.787500000000001e-05,
      "loss": 1.083,
      "step": 1795
    },
    {
      "epoch": 28.0625,
      "grad_norm": 2.4244182109832764,
      "learning_rate": 8.781250000000001e-05,
      "loss": 1.0324,
      "step": 1796
    },
    {
      "epoch": 28.078125,
      "grad_norm": 2.4665822982788086,
      "learning_rate": 8.775e-05,
      "loss": 1.3075,
      "step": 1797
    },
    {
      "epoch": 28.09375,
      "grad_norm": 2.3777945041656494,
      "learning_rate": 8.76875e-05,
      "loss": 1.4326,
      "step": 1798
    },
    {
      "epoch": 28.109375,
      "grad_norm": 2.488539457321167,
      "learning_rate": 8.7625e-05,
      "loss": 1.2706,
      "step": 1799
    },
    {
      "epoch": 28.125,
      "grad_norm": 2.325396776199341,
      "learning_rate": 8.756250000000001e-05,
      "loss": 1.2582,
      "step": 1800
    },
    {
      "epoch": 28.140625,
      "grad_norm": 2.4544148445129395,
      "learning_rate": 8.75e-05,
      "loss": 1.2119,
      "step": 1801
    },
    {
      "epoch": 28.15625,
      "grad_norm": 2.7688701152801514,
      "learning_rate": 8.74375e-05,
      "loss": 1.2673,
      "step": 1802
    },
    {
      "epoch": 28.171875,
      "grad_norm": 2.4710772037506104,
      "learning_rate": 8.737500000000001e-05,
      "loss": 1.2793,
      "step": 1803
    },
    {
      "epoch": 28.1875,
      "grad_norm": 2.527799367904663,
      "learning_rate": 8.731250000000001e-05,
      "loss": 1.4602,
      "step": 1804
    },
    {
      "epoch": 28.203125,
      "grad_norm": 2.356613874435425,
      "learning_rate": 8.725e-05,
      "loss": 1.3781,
      "step": 1805
    },
    {
      "epoch": 28.21875,
      "grad_norm": 2.352174758911133,
      "learning_rate": 8.71875e-05,
      "loss": 1.3206,
      "step": 1806
    },
    {
      "epoch": 28.234375,
      "grad_norm": 2.291260242462158,
      "learning_rate": 8.7125e-05,
      "loss": 1.348,
      "step": 1807
    },
    {
      "epoch": 28.25,
      "grad_norm": 2.3005340099334717,
      "learning_rate": 8.706250000000001e-05,
      "loss": 1.5759,
      "step": 1808
    },
    {
      "epoch": 28.265625,
      "grad_norm": 2.745114326477051,
      "learning_rate": 8.7e-05,
      "loss": 1.4649,
      "step": 1809
    },
    {
      "epoch": 28.28125,
      "grad_norm": 2.0481064319610596,
      "learning_rate": 8.69375e-05,
      "loss": 1.1149,
      "step": 1810
    },
    {
      "epoch": 28.296875,
      "grad_norm": 2.2938053607940674,
      "learning_rate": 8.687500000000001e-05,
      "loss": 1.2594,
      "step": 1811
    },
    {
      "epoch": 28.3125,
      "grad_norm": 2.209815263748169,
      "learning_rate": 8.68125e-05,
      "loss": 1.2531,
      "step": 1812
    },
    {
      "epoch": 28.328125,
      "grad_norm": 2.3220393657684326,
      "learning_rate": 8.675000000000001e-05,
      "loss": 1.4532,
      "step": 1813
    },
    {
      "epoch": 28.34375,
      "grad_norm": 2.331737995147705,
      "learning_rate": 8.66875e-05,
      "loss": 1.4683,
      "step": 1814
    },
    {
      "epoch": 28.359375,
      "grad_norm": 2.4901936054229736,
      "learning_rate": 8.6625e-05,
      "loss": 1.4209,
      "step": 1815
    },
    {
      "epoch": 28.375,
      "grad_norm": 2.298083543777466,
      "learning_rate": 8.65625e-05,
      "loss": 1.3529,
      "step": 1816
    },
    {
      "epoch": 28.390625,
      "grad_norm": 2.8475091457366943,
      "learning_rate": 8.65e-05,
      "loss": 1.3962,
      "step": 1817
    },
    {
      "epoch": 28.40625,
      "grad_norm": 2.287780284881592,
      "learning_rate": 8.643750000000001e-05,
      "loss": 1.2471,
      "step": 1818
    },
    {
      "epoch": 28.421875,
      "grad_norm": 2.536971092224121,
      "learning_rate": 8.637500000000001e-05,
      "loss": 1.2857,
      "step": 1819
    },
    {
      "epoch": 28.4375,
      "grad_norm": 2.273892879486084,
      "learning_rate": 8.63125e-05,
      "loss": 1.4475,
      "step": 1820
    },
    {
      "epoch": 28.453125,
      "grad_norm": 2.5178885459899902,
      "learning_rate": 8.625000000000001e-05,
      "loss": 1.2241,
      "step": 1821
    },
    {
      "epoch": 28.46875,
      "grad_norm": 2.291532516479492,
      "learning_rate": 8.61875e-05,
      "loss": 1.1414,
      "step": 1822
    },
    {
      "epoch": 28.484375,
      "grad_norm": 2.322047233581543,
      "learning_rate": 8.6125e-05,
      "loss": 1.5483,
      "step": 1823
    },
    {
      "epoch": 28.5,
      "grad_norm": 2.290435552597046,
      "learning_rate": 8.60625e-05,
      "loss": 1.4893,
      "step": 1824
    },
    {
      "epoch": 28.515625,
      "grad_norm": 2.252124786376953,
      "learning_rate": 8.6e-05,
      "loss": 1.3668,
      "step": 1825
    },
    {
      "epoch": 28.53125,
      "grad_norm": 2.185767889022827,
      "learning_rate": 8.593750000000001e-05,
      "loss": 1.4924,
      "step": 1826
    },
    {
      "epoch": 28.546875,
      "grad_norm": 2.6515204906463623,
      "learning_rate": 8.5875e-05,
      "loss": 1.329,
      "step": 1827
    },
    {
      "epoch": 28.5625,
      "grad_norm": 2.865997791290283,
      "learning_rate": 8.58125e-05,
      "loss": 1.5405,
      "step": 1828
    },
    {
      "epoch": 28.578125,
      "grad_norm": 3.0387699604034424,
      "learning_rate": 8.575000000000001e-05,
      "loss": 1.343,
      "step": 1829
    },
    {
      "epoch": 28.59375,
      "grad_norm": 2.1882097721099854,
      "learning_rate": 8.568750000000002e-05,
      "loss": 1.1875,
      "step": 1830
    },
    {
      "epoch": 28.609375,
      "grad_norm": 2.6367456912994385,
      "learning_rate": 8.5625e-05,
      "loss": 1.4127,
      "step": 1831
    },
    {
      "epoch": 28.625,
      "grad_norm": 2.7514266967773438,
      "learning_rate": 8.55625e-05,
      "loss": 1.5041,
      "step": 1832
    },
    {
      "epoch": 28.640625,
      "grad_norm": 2.298607110977173,
      "learning_rate": 8.55e-05,
      "loss": 1.0859,
      "step": 1833
    },
    {
      "epoch": 28.65625,
      "grad_norm": 2.4161887168884277,
      "learning_rate": 8.543750000000001e-05,
      "loss": 1.2618,
      "step": 1834
    },
    {
      "epoch": 28.671875,
      "grad_norm": 2.585703134536743,
      "learning_rate": 8.5375e-05,
      "loss": 1.3986,
      "step": 1835
    },
    {
      "epoch": 28.6875,
      "grad_norm": 2.5339415073394775,
      "learning_rate": 8.531250000000001e-05,
      "loss": 1.3509,
      "step": 1836
    },
    {
      "epoch": 28.703125,
      "grad_norm": 2.4321413040161133,
      "learning_rate": 8.525000000000001e-05,
      "loss": 1.3494,
      "step": 1837
    },
    {
      "epoch": 28.71875,
      "grad_norm": 2.1875975131988525,
      "learning_rate": 8.51875e-05,
      "loss": 1.302,
      "step": 1838
    },
    {
      "epoch": 28.734375,
      "grad_norm": 2.5404233932495117,
      "learning_rate": 8.5125e-05,
      "loss": 1.5992,
      "step": 1839
    },
    {
      "epoch": 28.75,
      "grad_norm": 2.6953251361846924,
      "learning_rate": 8.50625e-05,
      "loss": 1.2098,
      "step": 1840
    },
    {
      "epoch": 28.765625,
      "grad_norm": 2.4873664379119873,
      "learning_rate": 8.5e-05,
      "loss": 1.3932,
      "step": 1841
    },
    {
      "epoch": 28.78125,
      "grad_norm": 1.898756504058838,
      "learning_rate": 8.49375e-05,
      "loss": 1.2252,
      "step": 1842
    },
    {
      "epoch": 28.796875,
      "grad_norm": 2.395699977874756,
      "learning_rate": 8.4875e-05,
      "loss": 1.2837,
      "step": 1843
    },
    {
      "epoch": 28.8125,
      "grad_norm": 2.5855627059936523,
      "learning_rate": 8.481250000000001e-05,
      "loss": 1.2029,
      "step": 1844
    },
    {
      "epoch": 28.828125,
      "grad_norm": 2.5133798122406006,
      "learning_rate": 8.475000000000001e-05,
      "loss": 1.5304,
      "step": 1845
    },
    {
      "epoch": 28.84375,
      "grad_norm": 2.442232131958008,
      "learning_rate": 8.46875e-05,
      "loss": 1.3869,
      "step": 1846
    },
    {
      "epoch": 28.859375,
      "grad_norm": 2.448406457901001,
      "learning_rate": 8.4625e-05,
      "loss": 1.4401,
      "step": 1847
    },
    {
      "epoch": 28.875,
      "grad_norm": 2.706432819366455,
      "learning_rate": 8.45625e-05,
      "loss": 1.1534,
      "step": 1848
    },
    {
      "epoch": 28.890625,
      "grad_norm": 2.4254558086395264,
      "learning_rate": 8.450000000000001e-05,
      "loss": 1.3966,
      "step": 1849
    },
    {
      "epoch": 28.90625,
      "grad_norm": 2.3824055194854736,
      "learning_rate": 8.44375e-05,
      "loss": 1.3272,
      "step": 1850
    },
    {
      "epoch": 28.921875,
      "grad_norm": 2.341437578201294,
      "learning_rate": 8.4375e-05,
      "loss": 1.4557,
      "step": 1851
    },
    {
      "epoch": 28.9375,
      "grad_norm": 2.265751361846924,
      "learning_rate": 8.431250000000001e-05,
      "loss": 1.3198,
      "step": 1852
    },
    {
      "epoch": 28.953125,
      "grad_norm": 2.128093957901001,
      "learning_rate": 8.425e-05,
      "loss": 1.4437,
      "step": 1853
    },
    {
      "epoch": 28.96875,
      "grad_norm": 2.4645566940307617,
      "learning_rate": 8.41875e-05,
      "loss": 1.3771,
      "step": 1854
    },
    {
      "epoch": 28.984375,
      "grad_norm": 2.3335556983947754,
      "learning_rate": 8.412500000000001e-05,
      "loss": 1.3668,
      "step": 1855
    },
    {
      "epoch": 29.0,
      "grad_norm": 3.4888997077941895,
      "learning_rate": 8.40625e-05,
      "loss": 1.2562,
      "step": 1856
    },
    {
      "epoch": 29.0,
      "eval_loss": 3.050661563873291,
      "eval_runtime": 2.8613,
      "eval_samples_per_second": 178.938,
      "eval_steps_per_second": 44.734,
      "step": 1856
    },
    {
      "epoch": 29.015625,
      "grad_norm": 2.3706586360931396,
      "learning_rate": 8.4e-05,
      "loss": 1.3749,
      "step": 1857
    },
    {
      "epoch": 29.03125,
      "grad_norm": 2.8880419731140137,
      "learning_rate": 8.39375e-05,
      "loss": 1.1385,
      "step": 1858
    },
    {
      "epoch": 29.046875,
      "grad_norm": 2.186790943145752,
      "learning_rate": 8.3875e-05,
      "loss": 1.4661,
      "step": 1859
    },
    {
      "epoch": 29.0625,
      "grad_norm": 2.6575961112976074,
      "learning_rate": 8.381250000000001e-05,
      "loss": 1.3319,
      "step": 1860
    },
    {
      "epoch": 29.078125,
      "grad_norm": 2.115563154220581,
      "learning_rate": 8.375e-05,
      "loss": 1.2726,
      "step": 1861
    },
    {
      "epoch": 29.09375,
      "grad_norm": 2.3388009071350098,
      "learning_rate": 8.368750000000001e-05,
      "loss": 1.1266,
      "step": 1862
    },
    {
      "epoch": 29.109375,
      "grad_norm": 2.512110948562622,
      "learning_rate": 8.362500000000001e-05,
      "loss": 1.2559,
      "step": 1863
    },
    {
      "epoch": 29.125,
      "grad_norm": 2.3059449195861816,
      "learning_rate": 8.35625e-05,
      "loss": 1.4247,
      "step": 1864
    },
    {
      "epoch": 29.140625,
      "grad_norm": 2.699917793273926,
      "learning_rate": 8.35e-05,
      "loss": 1.3707,
      "step": 1865
    },
    {
      "epoch": 29.15625,
      "grad_norm": 2.8207523822784424,
      "learning_rate": 8.34375e-05,
      "loss": 1.2082,
      "step": 1866
    },
    {
      "epoch": 29.171875,
      "grad_norm": 2.3649559020996094,
      "learning_rate": 8.337500000000001e-05,
      "loss": 1.4296,
      "step": 1867
    },
    {
      "epoch": 29.1875,
      "grad_norm": 2.181089401245117,
      "learning_rate": 8.33125e-05,
      "loss": 1.2765,
      "step": 1868
    },
    {
      "epoch": 29.203125,
      "grad_norm": 2.2407047748565674,
      "learning_rate": 8.325e-05,
      "loss": 1.2815,
      "step": 1869
    },
    {
      "epoch": 29.21875,
      "grad_norm": 2.516223430633545,
      "learning_rate": 8.318750000000001e-05,
      "loss": 1.3181,
      "step": 1870
    },
    {
      "epoch": 29.234375,
      "grad_norm": 2.7790939807891846,
      "learning_rate": 8.312500000000001e-05,
      "loss": 1.0336,
      "step": 1871
    },
    {
      "epoch": 29.25,
      "grad_norm": 2.525062084197998,
      "learning_rate": 8.306249999999999e-05,
      "loss": 1.1425,
      "step": 1872
    },
    {
      "epoch": 29.265625,
      "grad_norm": 2.670572280883789,
      "learning_rate": 8.3e-05,
      "loss": 1.3827,
      "step": 1873
    },
    {
      "epoch": 29.28125,
      "grad_norm": 2.179635763168335,
      "learning_rate": 8.29375e-05,
      "loss": 1.4869,
      "step": 1874
    },
    {
      "epoch": 29.296875,
      "grad_norm": 2.4853885173797607,
      "learning_rate": 8.287500000000001e-05,
      "loss": 1.2398,
      "step": 1875
    },
    {
      "epoch": 29.3125,
      "grad_norm": 2.805851936340332,
      "learning_rate": 8.28125e-05,
      "loss": 1.342,
      "step": 1876
    },
    {
      "epoch": 29.328125,
      "grad_norm": 2.241379976272583,
      "learning_rate": 8.275e-05,
      "loss": 1.3796,
      "step": 1877
    },
    {
      "epoch": 29.34375,
      "grad_norm": 2.4368984699249268,
      "learning_rate": 8.268750000000001e-05,
      "loss": 1.456,
      "step": 1878
    },
    {
      "epoch": 29.359375,
      "grad_norm": 2.2352936267852783,
      "learning_rate": 8.2625e-05,
      "loss": 1.6073,
      "step": 1879
    },
    {
      "epoch": 29.375,
      "grad_norm": 2.195333242416382,
      "learning_rate": 8.256250000000001e-05,
      "loss": 1.5276,
      "step": 1880
    },
    {
      "epoch": 29.390625,
      "grad_norm": 2.324192762374878,
      "learning_rate": 8.25e-05,
      "loss": 1.3512,
      "step": 1881
    },
    {
      "epoch": 29.40625,
      "grad_norm": 2.3324203491210938,
      "learning_rate": 8.24375e-05,
      "loss": 1.2923,
      "step": 1882
    },
    {
      "epoch": 29.421875,
      "grad_norm": 2.195675849914551,
      "learning_rate": 8.2375e-05,
      "loss": 1.0345,
      "step": 1883
    },
    {
      "epoch": 29.4375,
      "grad_norm": 2.2806897163391113,
      "learning_rate": 8.23125e-05,
      "loss": 1.2649,
      "step": 1884
    },
    {
      "epoch": 29.453125,
      "grad_norm": 2.2607674598693848,
      "learning_rate": 8.225000000000001e-05,
      "loss": 1.2498,
      "step": 1885
    },
    {
      "epoch": 29.46875,
      "grad_norm": 2.4282941818237305,
      "learning_rate": 8.218750000000001e-05,
      "loss": 1.3665,
      "step": 1886
    },
    {
      "epoch": 29.484375,
      "grad_norm": 2.5612032413482666,
      "learning_rate": 8.2125e-05,
      "loss": 1.3527,
      "step": 1887
    },
    {
      "epoch": 29.5,
      "grad_norm": 2.072444438934326,
      "learning_rate": 8.206250000000001e-05,
      "loss": 1.3919,
      "step": 1888
    },
    {
      "epoch": 29.515625,
      "grad_norm": 2.4589436054229736,
      "learning_rate": 8.2e-05,
      "loss": 1.1668,
      "step": 1889
    },
    {
      "epoch": 29.53125,
      "grad_norm": 3.1244564056396484,
      "learning_rate": 8.19375e-05,
      "loss": 1.3446,
      "step": 1890
    },
    {
      "epoch": 29.546875,
      "grad_norm": 2.2875075340270996,
      "learning_rate": 8.1875e-05,
      "loss": 1.3008,
      "step": 1891
    },
    {
      "epoch": 29.5625,
      "grad_norm": 2.6798386573791504,
      "learning_rate": 8.18125e-05,
      "loss": 1.3092,
      "step": 1892
    },
    {
      "epoch": 29.578125,
      "grad_norm": 2.563800811767578,
      "learning_rate": 8.175000000000001e-05,
      "loss": 1.3002,
      "step": 1893
    },
    {
      "epoch": 29.59375,
      "grad_norm": 2.473098039627075,
      "learning_rate": 8.16875e-05,
      "loss": 1.4018,
      "step": 1894
    },
    {
      "epoch": 29.609375,
      "grad_norm": 2.4369874000549316,
      "learning_rate": 8.1625e-05,
      "loss": 1.4682,
      "step": 1895
    },
    {
      "epoch": 29.625,
      "grad_norm": 2.1726181507110596,
      "learning_rate": 8.156250000000001e-05,
      "loss": 1.221,
      "step": 1896
    },
    {
      "epoch": 29.640625,
      "grad_norm": 2.684556007385254,
      "learning_rate": 8.15e-05,
      "loss": 1.2544,
      "step": 1897
    },
    {
      "epoch": 29.65625,
      "grad_norm": 2.4799389839172363,
      "learning_rate": 8.14375e-05,
      "loss": 1.5429,
      "step": 1898
    },
    {
      "epoch": 29.671875,
      "grad_norm": 2.34982967376709,
      "learning_rate": 8.1375e-05,
      "loss": 1.4223,
      "step": 1899
    },
    {
      "epoch": 29.6875,
      "grad_norm": 2.333425760269165,
      "learning_rate": 8.13125e-05,
      "loss": 1.313,
      "step": 1900
    },
    {
      "epoch": 29.703125,
      "grad_norm": 2.4707424640655518,
      "learning_rate": 8.125000000000001e-05,
      "loss": 1.6294,
      "step": 1901
    },
    {
      "epoch": 29.71875,
      "grad_norm": 2.9507734775543213,
      "learning_rate": 8.11875e-05,
      "loss": 1.287,
      "step": 1902
    },
    {
      "epoch": 29.734375,
      "grad_norm": 2.432468891143799,
      "learning_rate": 8.112500000000001e-05,
      "loss": 1.4011,
      "step": 1903
    },
    {
      "epoch": 29.75,
      "grad_norm": 2.049851179122925,
      "learning_rate": 8.106250000000001e-05,
      "loss": 1.1989,
      "step": 1904
    },
    {
      "epoch": 29.765625,
      "grad_norm": 3.2906720638275146,
      "learning_rate": 8.1e-05,
      "loss": 1.3281,
      "step": 1905
    },
    {
      "epoch": 29.78125,
      "grad_norm": 2.3689663410186768,
      "learning_rate": 8.09375e-05,
      "loss": 1.1294,
      "step": 1906
    },
    {
      "epoch": 29.796875,
      "grad_norm": 2.653087615966797,
      "learning_rate": 8.0875e-05,
      "loss": 1.3502,
      "step": 1907
    },
    {
      "epoch": 29.8125,
      "grad_norm": 2.376650094985962,
      "learning_rate": 8.08125e-05,
      "loss": 1.2943,
      "step": 1908
    },
    {
      "epoch": 29.828125,
      "grad_norm": 2.4693944454193115,
      "learning_rate": 8.075e-05,
      "loss": 1.1415,
      "step": 1909
    },
    {
      "epoch": 29.84375,
      "grad_norm": 2.3264434337615967,
      "learning_rate": 8.06875e-05,
      "loss": 1.3676,
      "step": 1910
    },
    {
      "epoch": 29.859375,
      "grad_norm": 2.576991081237793,
      "learning_rate": 8.062500000000001e-05,
      "loss": 1.6782,
      "step": 1911
    },
    {
      "epoch": 29.875,
      "grad_norm": 2.3689675331115723,
      "learning_rate": 8.056250000000001e-05,
      "loss": 1.5169,
      "step": 1912
    },
    {
      "epoch": 29.890625,
      "grad_norm": 2.6830050945281982,
      "learning_rate": 8.05e-05,
      "loss": 1.4098,
      "step": 1913
    },
    {
      "epoch": 29.90625,
      "grad_norm": 2.641322374343872,
      "learning_rate": 8.04375e-05,
      "loss": 1.4079,
      "step": 1914
    },
    {
      "epoch": 29.921875,
      "grad_norm": 2.4760730266571045,
      "learning_rate": 8.0375e-05,
      "loss": 1.3036,
      "step": 1915
    },
    {
      "epoch": 29.9375,
      "grad_norm": 2.1256489753723145,
      "learning_rate": 8.031250000000001e-05,
      "loss": 1.3964,
      "step": 1916
    },
    {
      "epoch": 29.953125,
      "grad_norm": 2.5248641967773438,
      "learning_rate": 8.025e-05,
      "loss": 1.3941,
      "step": 1917
    },
    {
      "epoch": 29.96875,
      "grad_norm": 2.4105420112609863,
      "learning_rate": 8.01875e-05,
      "loss": 1.2763,
      "step": 1918
    },
    {
      "epoch": 29.984375,
      "grad_norm": 2.415919303894043,
      "learning_rate": 8.012500000000001e-05,
      "loss": 1.3174,
      "step": 1919
    },
    {
      "epoch": 30.0,
      "grad_norm": 2.8473455905914307,
      "learning_rate": 8.00625e-05,
      "loss": 1.2306,
      "step": 1920
    },
    {
      "epoch": 30.0,
      "eval_loss": 3.0597262382507324,
      "eval_runtime": 2.8481,
      "eval_samples_per_second": 179.769,
      "eval_steps_per_second": 44.942,
      "step": 1920
    },
    {
      "epoch": 30.015625,
      "grad_norm": 2.243298053741455,
      "learning_rate": 8e-05,
      "loss": 1.0381,
      "step": 1921
    },
    {
      "epoch": 30.03125,
      "grad_norm": 2.2839345932006836,
      "learning_rate": 7.99375e-05,
      "loss": 1.5718,
      "step": 1922
    },
    {
      "epoch": 30.046875,
      "grad_norm": 2.3050649166107178,
      "learning_rate": 7.9875e-05,
      "loss": 1.1758,
      "step": 1923
    },
    {
      "epoch": 30.0625,
      "grad_norm": 2.221419334411621,
      "learning_rate": 7.98125e-05,
      "loss": 1.4449,
      "step": 1924
    },
    {
      "epoch": 30.078125,
      "grad_norm": 2.282834053039551,
      "learning_rate": 7.975e-05,
      "loss": 1.1729,
      "step": 1925
    },
    {
      "epoch": 30.09375,
      "grad_norm": 2.4876232147216797,
      "learning_rate": 7.96875e-05,
      "loss": 1.2735,
      "step": 1926
    },
    {
      "epoch": 30.109375,
      "grad_norm": 2.4108078479766846,
      "learning_rate": 7.962500000000001e-05,
      "loss": 1.2674,
      "step": 1927
    },
    {
      "epoch": 30.125,
      "grad_norm": 2.629854440689087,
      "learning_rate": 7.95625e-05,
      "loss": 1.3999,
      "step": 1928
    },
    {
      "epoch": 30.140625,
      "grad_norm": 2.819713830947876,
      "learning_rate": 7.950000000000001e-05,
      "loss": 1.4231,
      "step": 1929
    },
    {
      "epoch": 30.15625,
      "grad_norm": 2.4748551845550537,
      "learning_rate": 7.943750000000001e-05,
      "loss": 1.3003,
      "step": 1930
    },
    {
      "epoch": 30.171875,
      "grad_norm": 2.454322099685669,
      "learning_rate": 7.9375e-05,
      "loss": 1.3181,
      "step": 1931
    },
    {
      "epoch": 30.1875,
      "grad_norm": 2.627302885055542,
      "learning_rate": 7.93125e-05,
      "loss": 1.3326,
      "step": 1932
    },
    {
      "epoch": 30.203125,
      "grad_norm": 2.292715311050415,
      "learning_rate": 7.925e-05,
      "loss": 1.2192,
      "step": 1933
    },
    {
      "epoch": 30.21875,
      "grad_norm": 2.415271043777466,
      "learning_rate": 7.918750000000001e-05,
      "loss": 1.1828,
      "step": 1934
    },
    {
      "epoch": 30.234375,
      "grad_norm": 2.5337724685668945,
      "learning_rate": 7.9125e-05,
      "loss": 1.1481,
      "step": 1935
    },
    {
      "epoch": 30.25,
      "grad_norm": 2.3589892387390137,
      "learning_rate": 7.90625e-05,
      "loss": 1.218,
      "step": 1936
    },
    {
      "epoch": 30.265625,
      "grad_norm": 2.45194935798645,
      "learning_rate": 7.900000000000001e-05,
      "loss": 1.4025,
      "step": 1937
    },
    {
      "epoch": 30.28125,
      "grad_norm": 2.4701242446899414,
      "learning_rate": 7.893750000000001e-05,
      "loss": 1.1326,
      "step": 1938
    },
    {
      "epoch": 30.296875,
      "grad_norm": 2.243572235107422,
      "learning_rate": 7.887499999999999e-05,
      "loss": 1.3649,
      "step": 1939
    },
    {
      "epoch": 30.3125,
      "grad_norm": 2.589404582977295,
      "learning_rate": 7.88125e-05,
      "loss": 1.2301,
      "step": 1940
    },
    {
      "epoch": 30.328125,
      "grad_norm": 2.4728610515594482,
      "learning_rate": 7.875e-05,
      "loss": 1.1356,
      "step": 1941
    },
    {
      "epoch": 30.34375,
      "grad_norm": 2.3767950534820557,
      "learning_rate": 7.868750000000001e-05,
      "loss": 1.4639,
      "step": 1942
    },
    {
      "epoch": 30.359375,
      "grad_norm": 2.4123151302337646,
      "learning_rate": 7.8625e-05,
      "loss": 1.2732,
      "step": 1943
    },
    {
      "epoch": 30.375,
      "grad_norm": 2.850285053253174,
      "learning_rate": 7.85625e-05,
      "loss": 1.3441,
      "step": 1944
    },
    {
      "epoch": 30.390625,
      "grad_norm": 2.5972423553466797,
      "learning_rate": 7.850000000000001e-05,
      "loss": 1.3776,
      "step": 1945
    },
    {
      "epoch": 30.40625,
      "grad_norm": 2.2573015689849854,
      "learning_rate": 7.84375e-05,
      "loss": 1.4734,
      "step": 1946
    },
    {
      "epoch": 30.421875,
      "grad_norm": 2.487711191177368,
      "learning_rate": 7.8375e-05,
      "loss": 1.33,
      "step": 1947
    },
    {
      "epoch": 30.4375,
      "grad_norm": 2.5618436336517334,
      "learning_rate": 7.83125e-05,
      "loss": 1.3272,
      "step": 1948
    },
    {
      "epoch": 30.453125,
      "grad_norm": 2.547907829284668,
      "learning_rate": 7.825e-05,
      "loss": 1.4976,
      "step": 1949
    },
    {
      "epoch": 30.46875,
      "grad_norm": 2.6455931663513184,
      "learning_rate": 7.81875e-05,
      "loss": 1.261,
      "step": 1950
    },
    {
      "epoch": 30.484375,
      "grad_norm": 2.431368350982666,
      "learning_rate": 7.8125e-05,
      "loss": 1.3196,
      "step": 1951
    },
    {
      "epoch": 30.5,
      "grad_norm": 2.425816059112549,
      "learning_rate": 7.806250000000001e-05,
      "loss": 1.3024,
      "step": 1952
    },
    {
      "epoch": 30.515625,
      "grad_norm": 2.3320391178131104,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.2401,
      "step": 1953
    },
    {
      "epoch": 30.53125,
      "grad_norm": 2.408737897872925,
      "learning_rate": 7.79375e-05,
      "loss": 1.2879,
      "step": 1954
    },
    {
      "epoch": 30.546875,
      "grad_norm": 2.4044549465179443,
      "learning_rate": 7.787500000000001e-05,
      "loss": 1.1703,
      "step": 1955
    },
    {
      "epoch": 30.5625,
      "grad_norm": 2.5110843181610107,
      "learning_rate": 7.78125e-05,
      "loss": 1.1966,
      "step": 1956
    },
    {
      "epoch": 30.578125,
      "grad_norm": 2.356023073196411,
      "learning_rate": 7.775e-05,
      "loss": 1.3575,
      "step": 1957
    },
    {
      "epoch": 30.59375,
      "grad_norm": 2.3768374919891357,
      "learning_rate": 7.76875e-05,
      "loss": 1.3271,
      "step": 1958
    },
    {
      "epoch": 30.609375,
      "grad_norm": 2.3572750091552734,
      "learning_rate": 7.7625e-05,
      "loss": 1.4368,
      "step": 1959
    },
    {
      "epoch": 30.625,
      "grad_norm": 2.4727284908294678,
      "learning_rate": 7.756250000000001e-05,
      "loss": 1.3304,
      "step": 1960
    },
    {
      "epoch": 30.640625,
      "grad_norm": 2.3545873165130615,
      "learning_rate": 7.75e-05,
      "loss": 1.1899,
      "step": 1961
    },
    {
      "epoch": 30.65625,
      "grad_norm": 2.4038212299346924,
      "learning_rate": 7.74375e-05,
      "loss": 1.1475,
      "step": 1962
    },
    {
      "epoch": 30.671875,
      "grad_norm": 2.75050950050354,
      "learning_rate": 7.737500000000001e-05,
      "loss": 1.3498,
      "step": 1963
    },
    {
      "epoch": 30.6875,
      "grad_norm": 2.526909112930298,
      "learning_rate": 7.73125e-05,
      "loss": 1.5424,
      "step": 1964
    },
    {
      "epoch": 30.703125,
      "grad_norm": 2.5108938217163086,
      "learning_rate": 7.725e-05,
      "loss": 1.2931,
      "step": 1965
    },
    {
      "epoch": 30.71875,
      "grad_norm": 2.5821890830993652,
      "learning_rate": 7.71875e-05,
      "loss": 1.3306,
      "step": 1966
    },
    {
      "epoch": 30.734375,
      "grad_norm": 2.5232620239257812,
      "learning_rate": 7.7125e-05,
      "loss": 1.4259,
      "step": 1967
    },
    {
      "epoch": 30.75,
      "grad_norm": 2.493201494216919,
      "learning_rate": 7.706250000000001e-05,
      "loss": 1.4885,
      "step": 1968
    },
    {
      "epoch": 30.765625,
      "grad_norm": 2.569944381713867,
      "learning_rate": 7.7e-05,
      "loss": 1.4168,
      "step": 1969
    },
    {
      "epoch": 30.78125,
      "grad_norm": 2.649271249771118,
      "learning_rate": 7.69375e-05,
      "loss": 1.1056,
      "step": 1970
    },
    {
      "epoch": 30.796875,
      "grad_norm": 2.501588821411133,
      "learning_rate": 7.687500000000001e-05,
      "loss": 1.496,
      "step": 1971
    },
    {
      "epoch": 30.8125,
      "grad_norm": 2.255279541015625,
      "learning_rate": 7.68125e-05,
      "loss": 1.1785,
      "step": 1972
    },
    {
      "epoch": 30.828125,
      "grad_norm": 2.3605024814605713,
      "learning_rate": 7.675e-05,
      "loss": 1.3695,
      "step": 1973
    },
    {
      "epoch": 30.84375,
      "grad_norm": 2.4197378158569336,
      "learning_rate": 7.66875e-05,
      "loss": 1.3676,
      "step": 1974
    },
    {
      "epoch": 30.859375,
      "grad_norm": 2.6067745685577393,
      "learning_rate": 7.6625e-05,
      "loss": 1.5112,
      "step": 1975
    },
    {
      "epoch": 30.875,
      "grad_norm": 2.6596381664276123,
      "learning_rate": 7.65625e-05,
      "loss": 1.5232,
      "step": 1976
    },
    {
      "epoch": 30.890625,
      "grad_norm": 2.530704975128174,
      "learning_rate": 7.65e-05,
      "loss": 1.3279,
      "step": 1977
    },
    {
      "epoch": 30.90625,
      "grad_norm": 2.7641892433166504,
      "learning_rate": 7.643750000000001e-05,
      "loss": 1.3178,
      "step": 1978
    },
    {
      "epoch": 30.921875,
      "grad_norm": 2.3807265758514404,
      "learning_rate": 7.637500000000001e-05,
      "loss": 1.4219,
      "step": 1979
    },
    {
      "epoch": 30.9375,
      "grad_norm": 2.385436534881592,
      "learning_rate": 7.63125e-05,
      "loss": 1.3783,
      "step": 1980
    },
    {
      "epoch": 30.953125,
      "grad_norm": 2.456477165222168,
      "learning_rate": 7.625e-05,
      "loss": 1.343,
      "step": 1981
    },
    {
      "epoch": 30.96875,
      "grad_norm": 2.374629497528076,
      "learning_rate": 7.61875e-05,
      "loss": 1.6239,
      "step": 1982
    },
    {
      "epoch": 30.984375,
      "grad_norm": 2.621753215789795,
      "learning_rate": 7.612500000000001e-05,
      "loss": 1.2617,
      "step": 1983
    },
    {
      "epoch": 31.0,
      "grad_norm": 2.544302225112915,
      "learning_rate": 7.60625e-05,
      "loss": 1.4765,
      "step": 1984
    },
    {
      "epoch": 31.0,
      "eval_loss": 3.0611345767974854,
      "eval_runtime": 2.87,
      "eval_samples_per_second": 178.398,
      "eval_steps_per_second": 44.599,
      "step": 1984
    },
    {
      "epoch": 31.015625,
      "grad_norm": 2.4561548233032227,
      "learning_rate": 7.6e-05,
      "loss": 1.2461,
      "step": 1985
    },
    {
      "epoch": 31.03125,
      "grad_norm": 2.046844720840454,
      "learning_rate": 7.593750000000001e-05,
      "loss": 1.3228,
      "step": 1986
    },
    {
      "epoch": 31.046875,
      "grad_norm": 2.517420768737793,
      "learning_rate": 7.5875e-05,
      "loss": 1.1654,
      "step": 1987
    },
    {
      "epoch": 31.0625,
      "grad_norm": 2.240361452102661,
      "learning_rate": 7.58125e-05,
      "loss": 1.3262,
      "step": 1988
    },
    {
      "epoch": 31.078125,
      "grad_norm": 2.2049145698547363,
      "learning_rate": 7.575e-05,
      "loss": 1.3424,
      "step": 1989
    },
    {
      "epoch": 31.09375,
      "grad_norm": 2.4178929328918457,
      "learning_rate": 7.56875e-05,
      "loss": 1.1944,
      "step": 1990
    },
    {
      "epoch": 31.109375,
      "grad_norm": 2.2037832736968994,
      "learning_rate": 7.5625e-05,
      "loss": 1.2761,
      "step": 1991
    },
    {
      "epoch": 31.125,
      "grad_norm": 2.47892689704895,
      "learning_rate": 7.55625e-05,
      "loss": 1.2489,
      "step": 1992
    },
    {
      "epoch": 31.140625,
      "grad_norm": 2.390213966369629,
      "learning_rate": 7.55e-05,
      "loss": 1.1715,
      "step": 1993
    },
    {
      "epoch": 31.15625,
      "grad_norm": 2.399014949798584,
      "learning_rate": 7.543750000000001e-05,
      "loss": 1.3846,
      "step": 1994
    },
    {
      "epoch": 31.171875,
      "grad_norm": 2.2854881286621094,
      "learning_rate": 7.5375e-05,
      "loss": 1.4593,
      "step": 1995
    },
    {
      "epoch": 31.1875,
      "grad_norm": 2.2560558319091797,
      "learning_rate": 7.531250000000001e-05,
      "loss": 1.458,
      "step": 1996
    },
    {
      "epoch": 31.203125,
      "grad_norm": 2.254143476486206,
      "learning_rate": 7.525e-05,
      "loss": 1.033,
      "step": 1997
    },
    {
      "epoch": 31.21875,
      "grad_norm": 2.1141440868377686,
      "learning_rate": 7.51875e-05,
      "loss": 1.406,
      "step": 1998
    },
    {
      "epoch": 31.234375,
      "grad_norm": 2.533381223678589,
      "learning_rate": 7.5125e-05,
      "loss": 1.0435,
      "step": 1999
    },
    {
      "epoch": 31.25,
      "grad_norm": 2.1300745010375977,
      "learning_rate": 7.50625e-05,
      "loss": 1.4663,
      "step": 2000
    },
    {
      "epoch": 31.265625,
      "grad_norm": 2.5386478900909424,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.1931,
      "step": 2001
    },
    {
      "epoch": 31.28125,
      "grad_norm": 2.5353517532348633,
      "learning_rate": 7.49375e-05,
      "loss": 1.4356,
      "step": 2002
    },
    {
      "epoch": 31.296875,
      "grad_norm": 2.5385639667510986,
      "learning_rate": 7.4875e-05,
      "loss": 1.3552,
      "step": 2003
    },
    {
      "epoch": 31.3125,
      "grad_norm": 2.2310829162597656,
      "learning_rate": 7.481250000000001e-05,
      "loss": 1.3779,
      "step": 2004
    },
    {
      "epoch": 31.328125,
      "grad_norm": 2.1493585109710693,
      "learning_rate": 7.475000000000001e-05,
      "loss": 1.3416,
      "step": 2005
    },
    {
      "epoch": 31.34375,
      "grad_norm": 2.7533044815063477,
      "learning_rate": 7.468749999999999e-05,
      "loss": 1.2558,
      "step": 2006
    },
    {
      "epoch": 31.359375,
      "grad_norm": 2.365610361099243,
      "learning_rate": 7.4625e-05,
      "loss": 1.0469,
      "step": 2007
    },
    {
      "epoch": 31.375,
      "grad_norm": 2.207770824432373,
      "learning_rate": 7.45625e-05,
      "loss": 1.2686,
      "step": 2008
    },
    {
      "epoch": 31.390625,
      "grad_norm": 2.2714290618896484,
      "learning_rate": 7.450000000000001e-05,
      "loss": 1.2631,
      "step": 2009
    },
    {
      "epoch": 31.40625,
      "grad_norm": 2.4725778102874756,
      "learning_rate": 7.44375e-05,
      "loss": 1.4666,
      "step": 2010
    },
    {
      "epoch": 31.421875,
      "grad_norm": 2.669138193130493,
      "learning_rate": 7.4375e-05,
      "loss": 1.4757,
      "step": 2011
    },
    {
      "epoch": 31.4375,
      "grad_norm": 2.6999664306640625,
      "learning_rate": 7.431250000000001e-05,
      "loss": 1.3191,
      "step": 2012
    },
    {
      "epoch": 31.453125,
      "grad_norm": 2.740605115890503,
      "learning_rate": 7.425e-05,
      "loss": 1.3586,
      "step": 2013
    },
    {
      "epoch": 31.46875,
      "grad_norm": 2.977104902267456,
      "learning_rate": 7.41875e-05,
      "loss": 1.1461,
      "step": 2014
    },
    {
      "epoch": 31.484375,
      "grad_norm": 2.202676773071289,
      "learning_rate": 7.4125e-05,
      "loss": 1.4647,
      "step": 2015
    },
    {
      "epoch": 31.5,
      "grad_norm": 2.8642563819885254,
      "learning_rate": 7.40625e-05,
      "loss": 1.2386,
      "step": 2016
    },
    {
      "epoch": 31.515625,
      "grad_norm": 2.614811658859253,
      "learning_rate": 7.4e-05,
      "loss": 1.3324,
      "step": 2017
    },
    {
      "epoch": 31.53125,
      "grad_norm": 2.70609450340271,
      "learning_rate": 7.39375e-05,
      "loss": 1.2096,
      "step": 2018
    },
    {
      "epoch": 31.546875,
      "grad_norm": 2.4627978801727295,
      "learning_rate": 7.3875e-05,
      "loss": 1.1766,
      "step": 2019
    },
    {
      "epoch": 31.5625,
      "grad_norm": 2.2922959327697754,
      "learning_rate": 7.381250000000001e-05,
      "loss": 1.2401,
      "step": 2020
    },
    {
      "epoch": 31.578125,
      "grad_norm": 2.4665956497192383,
      "learning_rate": 7.375e-05,
      "loss": 1.632,
      "step": 2021
    },
    {
      "epoch": 31.59375,
      "grad_norm": 2.7751576900482178,
      "learning_rate": 7.36875e-05,
      "loss": 1.2988,
      "step": 2022
    },
    {
      "epoch": 31.609375,
      "grad_norm": 2.4931278228759766,
      "learning_rate": 7.3625e-05,
      "loss": 1.3527,
      "step": 2023
    },
    {
      "epoch": 31.625,
      "grad_norm": 2.193869113922119,
      "learning_rate": 7.35625e-05,
      "loss": 1.3714,
      "step": 2024
    },
    {
      "epoch": 31.640625,
      "grad_norm": 2.734752893447876,
      "learning_rate": 7.35e-05,
      "loss": 1.2937,
      "step": 2025
    },
    {
      "epoch": 31.65625,
      "grad_norm": 2.423023223876953,
      "learning_rate": 7.34375e-05,
      "loss": 1.3584,
      "step": 2026
    },
    {
      "epoch": 31.671875,
      "grad_norm": 2.4248456954956055,
      "learning_rate": 7.337500000000001e-05,
      "loss": 1.3354,
      "step": 2027
    },
    {
      "epoch": 31.6875,
      "grad_norm": 2.650225877761841,
      "learning_rate": 7.33125e-05,
      "loss": 1.2417,
      "step": 2028
    },
    {
      "epoch": 31.703125,
      "grad_norm": 2.304382801055908,
      "learning_rate": 7.325e-05,
      "loss": 1.4772,
      "step": 2029
    },
    {
      "epoch": 31.71875,
      "grad_norm": 2.404806613922119,
      "learning_rate": 7.318750000000001e-05,
      "loss": 1.5181,
      "step": 2030
    },
    {
      "epoch": 31.734375,
      "grad_norm": 2.2516322135925293,
      "learning_rate": 7.3125e-05,
      "loss": 1.2384,
      "step": 2031
    },
    {
      "epoch": 31.75,
      "grad_norm": 3.1772093772888184,
      "learning_rate": 7.30625e-05,
      "loss": 1.2786,
      "step": 2032
    },
    {
      "epoch": 31.765625,
      "grad_norm": 2.5929975509643555,
      "learning_rate": 7.3e-05,
      "loss": 1.3486,
      "step": 2033
    },
    {
      "epoch": 31.78125,
      "grad_norm": 2.5987768173217773,
      "learning_rate": 7.29375e-05,
      "loss": 1.4114,
      "step": 2034
    },
    {
      "epoch": 31.796875,
      "grad_norm": 2.383439540863037,
      "learning_rate": 7.287500000000001e-05,
      "loss": 1.3218,
      "step": 2035
    },
    {
      "epoch": 31.8125,
      "grad_norm": 2.857064962387085,
      "learning_rate": 7.28125e-05,
      "loss": 1.3657,
      "step": 2036
    },
    {
      "epoch": 31.828125,
      "grad_norm": 2.416991949081421,
      "learning_rate": 7.275e-05,
      "loss": 1.1792,
      "step": 2037
    },
    {
      "epoch": 31.84375,
      "grad_norm": 2.377809524536133,
      "learning_rate": 7.268750000000001e-05,
      "loss": 1.2741,
      "step": 2038
    },
    {
      "epoch": 31.859375,
      "grad_norm": 2.434091567993164,
      "learning_rate": 7.2625e-05,
      "loss": 1.4672,
      "step": 2039
    },
    {
      "epoch": 31.875,
      "grad_norm": 2.435007333755493,
      "learning_rate": 7.25625e-05,
      "loss": 1.1906,
      "step": 2040
    },
    {
      "epoch": 31.890625,
      "grad_norm": 2.8275482654571533,
      "learning_rate": 7.25e-05,
      "loss": 1.256,
      "step": 2041
    },
    {
      "epoch": 31.90625,
      "grad_norm": 2.499622344970703,
      "learning_rate": 7.24375e-05,
      "loss": 1.1889,
      "step": 2042
    },
    {
      "epoch": 31.921875,
      "grad_norm": 2.267218589782715,
      "learning_rate": 7.2375e-05,
      "loss": 1.2405,
      "step": 2043
    },
    {
      "epoch": 31.9375,
      "grad_norm": 2.8253700733184814,
      "learning_rate": 7.23125e-05,
      "loss": 1.1225,
      "step": 2044
    },
    {
      "epoch": 31.953125,
      "grad_norm": 2.3821237087249756,
      "learning_rate": 7.225000000000001e-05,
      "loss": 1.3296,
      "step": 2045
    },
    {
      "epoch": 31.96875,
      "grad_norm": 2.2923073768615723,
      "learning_rate": 7.218750000000001e-05,
      "loss": 1.5757,
      "step": 2046
    },
    {
      "epoch": 31.984375,
      "grad_norm": 2.475167751312256,
      "learning_rate": 7.2125e-05,
      "loss": 1.1062,
      "step": 2047
    },
    {
      "epoch": 32.0,
      "grad_norm": 2.619253158569336,
      "learning_rate": 7.20625e-05,
      "loss": 1.3004,
      "step": 2048
    },
    {
      "epoch": 32.0,
      "eval_loss": 3.0650877952575684,
      "eval_runtime": 2.8703,
      "eval_samples_per_second": 178.376,
      "eval_steps_per_second": 44.594,
      "step": 2048
    },
    {
      "epoch": 32.015625,
      "grad_norm": 2.4766886234283447,
      "learning_rate": 7.2e-05,
      "loss": 1.3628,
      "step": 2049
    },
    {
      "epoch": 32.03125,
      "grad_norm": 2.4750828742980957,
      "learning_rate": 7.193750000000001e-05,
      "loss": 1.4255,
      "step": 2050
    },
    {
      "epoch": 32.046875,
      "grad_norm": 2.270620346069336,
      "learning_rate": 7.1875e-05,
      "loss": 1.3891,
      "step": 2051
    },
    {
      "epoch": 32.0625,
      "grad_norm": 2.5487060546875,
      "learning_rate": 7.18125e-05,
      "loss": 1.2043,
      "step": 2052
    },
    {
      "epoch": 32.078125,
      "grad_norm": 2.4503514766693115,
      "learning_rate": 7.175000000000001e-05,
      "loss": 0.9023,
      "step": 2053
    },
    {
      "epoch": 32.09375,
      "grad_norm": 2.3846969604492188,
      "learning_rate": 7.16875e-05,
      "loss": 1.3903,
      "step": 2054
    },
    {
      "epoch": 32.109375,
      "grad_norm": 2.277449369430542,
      "learning_rate": 7.1625e-05,
      "loss": 1.1587,
      "step": 2055
    },
    {
      "epoch": 32.125,
      "grad_norm": 2.278508424758911,
      "learning_rate": 7.15625e-05,
      "loss": 1.1988,
      "step": 2056
    },
    {
      "epoch": 32.140625,
      "grad_norm": 2.2853543758392334,
      "learning_rate": 7.15e-05,
      "loss": 1.4715,
      "step": 2057
    },
    {
      "epoch": 32.15625,
      "grad_norm": 2.457690715789795,
      "learning_rate": 7.14375e-05,
      "loss": 1.1867,
      "step": 2058
    },
    {
      "epoch": 32.171875,
      "grad_norm": 2.2130773067474365,
      "learning_rate": 7.1375e-05,
      "loss": 1.346,
      "step": 2059
    },
    {
      "epoch": 32.1875,
      "grad_norm": 2.633162021636963,
      "learning_rate": 7.13125e-05,
      "loss": 1.2782,
      "step": 2060
    },
    {
      "epoch": 32.203125,
      "grad_norm": 2.336540937423706,
      "learning_rate": 7.125000000000001e-05,
      "loss": 1.3431,
      "step": 2061
    },
    {
      "epoch": 32.21875,
      "grad_norm": 2.3743038177490234,
      "learning_rate": 7.11875e-05,
      "loss": 1.3452,
      "step": 2062
    },
    {
      "epoch": 32.234375,
      "grad_norm": 2.651566505432129,
      "learning_rate": 7.112500000000001e-05,
      "loss": 0.9915,
      "step": 2063
    },
    {
      "epoch": 32.25,
      "grad_norm": 2.4090287685394287,
      "learning_rate": 7.10625e-05,
      "loss": 1.3948,
      "step": 2064
    },
    {
      "epoch": 32.265625,
      "grad_norm": 2.4464035034179688,
      "learning_rate": 7.1e-05,
      "loss": 1.4304,
      "step": 2065
    },
    {
      "epoch": 32.28125,
      "grad_norm": 2.255868673324585,
      "learning_rate": 7.09375e-05,
      "loss": 1.4346,
      "step": 2066
    },
    {
      "epoch": 32.296875,
      "grad_norm": 2.42494535446167,
      "learning_rate": 7.0875e-05,
      "loss": 1.2311,
      "step": 2067
    },
    {
      "epoch": 32.3125,
      "grad_norm": 2.224954843521118,
      "learning_rate": 7.081250000000001e-05,
      "loss": 1.2327,
      "step": 2068
    },
    {
      "epoch": 32.328125,
      "grad_norm": 2.680690288543701,
      "learning_rate": 7.075e-05,
      "loss": 1.1516,
      "step": 2069
    },
    {
      "epoch": 32.34375,
      "grad_norm": 2.3225481510162354,
      "learning_rate": 7.06875e-05,
      "loss": 1.1317,
      "step": 2070
    },
    {
      "epoch": 32.359375,
      "grad_norm": 2.1743693351745605,
      "learning_rate": 7.062500000000001e-05,
      "loss": 1.1247,
      "step": 2071
    },
    {
      "epoch": 32.375,
      "grad_norm": 2.4790701866149902,
      "learning_rate": 7.05625e-05,
      "loss": 1.33,
      "step": 2072
    },
    {
      "epoch": 32.390625,
      "grad_norm": 2.4535880088806152,
      "learning_rate": 7.05e-05,
      "loss": 1.2183,
      "step": 2073
    },
    {
      "epoch": 32.40625,
      "grad_norm": 2.279792547225952,
      "learning_rate": 7.04375e-05,
      "loss": 1.3698,
      "step": 2074
    },
    {
      "epoch": 32.421875,
      "grad_norm": 2.4975674152374268,
      "learning_rate": 7.0375e-05,
      "loss": 1.312,
      "step": 2075
    },
    {
      "epoch": 32.4375,
      "grad_norm": 2.3717312812805176,
      "learning_rate": 7.031250000000001e-05,
      "loss": 1.171,
      "step": 2076
    },
    {
      "epoch": 32.453125,
      "grad_norm": 2.3972012996673584,
      "learning_rate": 7.025e-05,
      "loss": 1.3143,
      "step": 2077
    },
    {
      "epoch": 32.46875,
      "grad_norm": 2.438347578048706,
      "learning_rate": 7.01875e-05,
      "loss": 1.1259,
      "step": 2078
    },
    {
      "epoch": 32.484375,
      "grad_norm": 2.3644025325775146,
      "learning_rate": 7.012500000000001e-05,
      "loss": 1.314,
      "step": 2079
    },
    {
      "epoch": 32.5,
      "grad_norm": 2.891456365585327,
      "learning_rate": 7.00625e-05,
      "loss": 1.3038,
      "step": 2080
    },
    {
      "epoch": 32.515625,
      "grad_norm": 2.4934253692626953,
      "learning_rate": 7e-05,
      "loss": 1.3897,
      "step": 2081
    },
    {
      "epoch": 32.53125,
      "grad_norm": 2.497067451477051,
      "learning_rate": 6.99375e-05,
      "loss": 1.3787,
      "step": 2082
    },
    {
      "epoch": 32.546875,
      "grad_norm": 2.601365089416504,
      "learning_rate": 6.9875e-05,
      "loss": 1.2824,
      "step": 2083
    },
    {
      "epoch": 32.5625,
      "grad_norm": 2.0955049991607666,
      "learning_rate": 6.981250000000001e-05,
      "loss": 1.3049,
      "step": 2084
    },
    {
      "epoch": 32.578125,
      "grad_norm": 2.5113048553466797,
      "learning_rate": 6.975e-05,
      "loss": 1.2326,
      "step": 2085
    },
    {
      "epoch": 32.59375,
      "grad_norm": 2.471109628677368,
      "learning_rate": 6.96875e-05,
      "loss": 1.2513,
      "step": 2086
    },
    {
      "epoch": 32.609375,
      "grad_norm": 2.4926130771636963,
      "learning_rate": 6.962500000000001e-05,
      "loss": 1.2153,
      "step": 2087
    },
    {
      "epoch": 32.625,
      "grad_norm": 2.5651164054870605,
      "learning_rate": 6.95625e-05,
      "loss": 1.185,
      "step": 2088
    },
    {
      "epoch": 32.640625,
      "grad_norm": 2.360314130783081,
      "learning_rate": 6.95e-05,
      "loss": 1.3631,
      "step": 2089
    },
    {
      "epoch": 32.65625,
      "grad_norm": 2.3729050159454346,
      "learning_rate": 6.94375e-05,
      "loss": 1.3517,
      "step": 2090
    },
    {
      "epoch": 32.671875,
      "grad_norm": 2.3025412559509277,
      "learning_rate": 6.9375e-05,
      "loss": 1.414,
      "step": 2091
    },
    {
      "epoch": 32.6875,
      "grad_norm": 2.790457010269165,
      "learning_rate": 6.93125e-05,
      "loss": 1.2957,
      "step": 2092
    },
    {
      "epoch": 32.703125,
      "grad_norm": 2.615929126739502,
      "learning_rate": 6.925e-05,
      "loss": 1.1938,
      "step": 2093
    },
    {
      "epoch": 32.71875,
      "grad_norm": 2.5610971450805664,
      "learning_rate": 6.918750000000001e-05,
      "loss": 1.4381,
      "step": 2094
    },
    {
      "epoch": 32.734375,
      "grad_norm": 2.193241834640503,
      "learning_rate": 6.9125e-05,
      "loss": 1.4985,
      "step": 2095
    },
    {
      "epoch": 32.75,
      "grad_norm": 2.6518349647521973,
      "learning_rate": 6.90625e-05,
      "loss": 1.2896,
      "step": 2096
    },
    {
      "epoch": 32.765625,
      "grad_norm": 2.5103280544281006,
      "learning_rate": 6.9e-05,
      "loss": 1.2712,
      "step": 2097
    },
    {
      "epoch": 32.78125,
      "grad_norm": 2.3990843296051025,
      "learning_rate": 6.89375e-05,
      "loss": 1.2724,
      "step": 2098
    },
    {
      "epoch": 32.796875,
      "grad_norm": 2.620070695877075,
      "learning_rate": 6.887500000000001e-05,
      "loss": 1.2533,
      "step": 2099
    },
    {
      "epoch": 32.8125,
      "grad_norm": 2.4586451053619385,
      "learning_rate": 6.88125e-05,
      "loss": 1.3853,
      "step": 2100
    },
    {
      "epoch": 32.828125,
      "grad_norm": 2.4729182720184326,
      "learning_rate": 6.875e-05,
      "loss": 1.3927,
      "step": 2101
    },
    {
      "epoch": 32.84375,
      "grad_norm": 2.3878989219665527,
      "learning_rate": 6.868750000000001e-05,
      "loss": 1.4039,
      "step": 2102
    },
    {
      "epoch": 32.859375,
      "grad_norm": 2.432701349258423,
      "learning_rate": 6.8625e-05,
      "loss": 1.4048,
      "step": 2103
    },
    {
      "epoch": 32.875,
      "grad_norm": 2.5297157764434814,
      "learning_rate": 6.85625e-05,
      "loss": 1.2455,
      "step": 2104
    },
    {
      "epoch": 32.890625,
      "grad_norm": 2.5070724487304688,
      "learning_rate": 6.850000000000001e-05,
      "loss": 1.1344,
      "step": 2105
    },
    {
      "epoch": 32.90625,
      "grad_norm": 2.307124137878418,
      "learning_rate": 6.84375e-05,
      "loss": 1.07,
      "step": 2106
    },
    {
      "epoch": 32.921875,
      "grad_norm": 2.723708152770996,
      "learning_rate": 6.8375e-05,
      "loss": 1.3259,
      "step": 2107
    },
    {
      "epoch": 32.9375,
      "grad_norm": 2.294153928756714,
      "learning_rate": 6.83125e-05,
      "loss": 1.3613,
      "step": 2108
    },
    {
      "epoch": 32.953125,
      "grad_norm": 2.2524333000183105,
      "learning_rate": 6.825e-05,
      "loss": 1.0234,
      "step": 2109
    },
    {
      "epoch": 32.96875,
      "grad_norm": 2.3096659183502197,
      "learning_rate": 6.818750000000001e-05,
      "loss": 1.383,
      "step": 2110
    },
    {
      "epoch": 32.984375,
      "grad_norm": 2.422907590866089,
      "learning_rate": 6.8125e-05,
      "loss": 1.4496,
      "step": 2111
    },
    {
      "epoch": 33.0,
      "grad_norm": 3.4481258392333984,
      "learning_rate": 6.806250000000001e-05,
      "loss": 1.2535,
      "step": 2112
    },
    {
      "epoch": 33.0,
      "eval_loss": 3.071908950805664,
      "eval_runtime": 2.9479,
      "eval_samples_per_second": 173.684,
      "eval_steps_per_second": 43.421,
      "step": 2112
    },
    {
      "epoch": 33.015625,
      "grad_norm": 2.2339372634887695,
      "learning_rate": 6.800000000000001e-05,
      "loss": 1.3839,
      "step": 2113
    },
    {
      "epoch": 33.03125,
      "grad_norm": 2.408453941345215,
      "learning_rate": 6.79375e-05,
      "loss": 1.526,
      "step": 2114
    },
    {
      "epoch": 33.046875,
      "grad_norm": 2.3368217945098877,
      "learning_rate": 6.7875e-05,
      "loss": 1.2719,
      "step": 2115
    },
    {
      "epoch": 33.0625,
      "grad_norm": 2.4771623611450195,
      "learning_rate": 6.78125e-05,
      "loss": 1.1592,
      "step": 2116
    },
    {
      "epoch": 33.078125,
      "grad_norm": 2.4564685821533203,
      "learning_rate": 6.775000000000001e-05,
      "loss": 1.0927,
      "step": 2117
    },
    {
      "epoch": 33.09375,
      "grad_norm": 2.315577745437622,
      "learning_rate": 6.76875e-05,
      "loss": 1.1996,
      "step": 2118
    },
    {
      "epoch": 33.109375,
      "grad_norm": 2.3146286010742188,
      "learning_rate": 6.7625e-05,
      "loss": 1.2175,
      "step": 2119
    },
    {
      "epoch": 33.125,
      "grad_norm": 2.575855016708374,
      "learning_rate": 6.756250000000001e-05,
      "loss": 1.2739,
      "step": 2120
    },
    {
      "epoch": 33.140625,
      "grad_norm": 2.1909070014953613,
      "learning_rate": 6.750000000000001e-05,
      "loss": 1.2904,
      "step": 2121
    },
    {
      "epoch": 33.15625,
      "grad_norm": 2.514859676361084,
      "learning_rate": 6.743749999999999e-05,
      "loss": 1.1998,
      "step": 2122
    },
    {
      "epoch": 33.171875,
      "grad_norm": 2.3137242794036865,
      "learning_rate": 6.7375e-05,
      "loss": 1.0911,
      "step": 2123
    },
    {
      "epoch": 33.1875,
      "grad_norm": 2.317004442214966,
      "learning_rate": 6.73125e-05,
      "loss": 1.0837,
      "step": 2124
    },
    {
      "epoch": 33.203125,
      "grad_norm": 2.586578369140625,
      "learning_rate": 6.725000000000001e-05,
      "loss": 1.1359,
      "step": 2125
    },
    {
      "epoch": 33.21875,
      "grad_norm": 2.652360200881958,
      "learning_rate": 6.71875e-05,
      "loss": 1.2852,
      "step": 2126
    },
    {
      "epoch": 33.234375,
      "grad_norm": 2.615938186645508,
      "learning_rate": 6.7125e-05,
      "loss": 1.373,
      "step": 2127
    },
    {
      "epoch": 33.25,
      "grad_norm": 2.490614175796509,
      "learning_rate": 6.706250000000001e-05,
      "loss": 0.9427,
      "step": 2128
    },
    {
      "epoch": 33.265625,
      "grad_norm": 2.5119001865386963,
      "learning_rate": 6.7e-05,
      "loss": 1.2743,
      "step": 2129
    },
    {
      "epoch": 33.28125,
      "grad_norm": 2.5067410469055176,
      "learning_rate": 6.693750000000001e-05,
      "loss": 1.2735,
      "step": 2130
    },
    {
      "epoch": 33.296875,
      "grad_norm": 2.176511287689209,
      "learning_rate": 6.6875e-05,
      "loss": 1.183,
      "step": 2131
    },
    {
      "epoch": 33.3125,
      "grad_norm": 2.6750478744506836,
      "learning_rate": 6.68125e-05,
      "loss": 1.2095,
      "step": 2132
    },
    {
      "epoch": 33.328125,
      "grad_norm": 2.655350685119629,
      "learning_rate": 6.675e-05,
      "loss": 1.2829,
      "step": 2133
    },
    {
      "epoch": 33.34375,
      "grad_norm": 2.212824583053589,
      "learning_rate": 6.66875e-05,
      "loss": 1.2727,
      "step": 2134
    },
    {
      "epoch": 33.359375,
      "grad_norm": 2.4004077911376953,
      "learning_rate": 6.6625e-05,
      "loss": 1.3349,
      "step": 2135
    },
    {
      "epoch": 33.375,
      "grad_norm": 2.4126269817352295,
      "learning_rate": 6.656250000000001e-05,
      "loss": 1.5374,
      "step": 2136
    },
    {
      "epoch": 33.390625,
      "grad_norm": 2.6505770683288574,
      "learning_rate": 6.65e-05,
      "loss": 1.2654,
      "step": 2137
    },
    {
      "epoch": 33.40625,
      "grad_norm": 2.7497787475585938,
      "learning_rate": 6.643750000000001e-05,
      "loss": 1.3996,
      "step": 2138
    },
    {
      "epoch": 33.421875,
      "grad_norm": 2.385464668273926,
      "learning_rate": 6.6375e-05,
      "loss": 1.3047,
      "step": 2139
    },
    {
      "epoch": 33.4375,
      "grad_norm": 2.6757149696350098,
      "learning_rate": 6.63125e-05,
      "loss": 1.3457,
      "step": 2140
    },
    {
      "epoch": 33.453125,
      "grad_norm": 2.269559144973755,
      "learning_rate": 6.625e-05,
      "loss": 1.324,
      "step": 2141
    },
    {
      "epoch": 33.46875,
      "grad_norm": 2.590893030166626,
      "learning_rate": 6.61875e-05,
      "loss": 1.2999,
      "step": 2142
    },
    {
      "epoch": 33.484375,
      "grad_norm": 2.7694990634918213,
      "learning_rate": 6.612500000000001e-05,
      "loss": 1.2816,
      "step": 2143
    },
    {
      "epoch": 33.5,
      "grad_norm": 2.490323543548584,
      "learning_rate": 6.60625e-05,
      "loss": 1.239,
      "step": 2144
    },
    {
      "epoch": 33.515625,
      "grad_norm": 2.3738911151885986,
      "learning_rate": 6.6e-05,
      "loss": 1.3581,
      "step": 2145
    },
    {
      "epoch": 33.53125,
      "grad_norm": 2.643203020095825,
      "learning_rate": 6.593750000000001e-05,
      "loss": 1.2677,
      "step": 2146
    },
    {
      "epoch": 33.546875,
      "grad_norm": 2.519604206085205,
      "learning_rate": 6.5875e-05,
      "loss": 1.3604,
      "step": 2147
    },
    {
      "epoch": 33.5625,
      "grad_norm": 2.582381248474121,
      "learning_rate": 6.58125e-05,
      "loss": 1.2999,
      "step": 2148
    },
    {
      "epoch": 33.578125,
      "grad_norm": 2.311035394668579,
      "learning_rate": 6.575e-05,
      "loss": 1.2233,
      "step": 2149
    },
    {
      "epoch": 33.59375,
      "grad_norm": 2.384537935256958,
      "learning_rate": 6.56875e-05,
      "loss": 1.5004,
      "step": 2150
    },
    {
      "epoch": 33.609375,
      "grad_norm": 2.1816697120666504,
      "learning_rate": 6.562500000000001e-05,
      "loss": 1.2625,
      "step": 2151
    },
    {
      "epoch": 33.625,
      "grad_norm": 2.233576536178589,
      "learning_rate": 6.55625e-05,
      "loss": 1.3501,
      "step": 2152
    },
    {
      "epoch": 33.640625,
      "grad_norm": 2.8021674156188965,
      "learning_rate": 6.55e-05,
      "loss": 1.1825,
      "step": 2153
    },
    {
      "epoch": 33.65625,
      "grad_norm": 2.7278330326080322,
      "learning_rate": 6.543750000000001e-05,
      "loss": 1.4279,
      "step": 2154
    },
    {
      "epoch": 33.671875,
      "grad_norm": 2.3741090297698975,
      "learning_rate": 6.5375e-05,
      "loss": 1.0873,
      "step": 2155
    },
    {
      "epoch": 33.6875,
      "grad_norm": 2.308974504470825,
      "learning_rate": 6.53125e-05,
      "loss": 1.2754,
      "step": 2156
    },
    {
      "epoch": 33.703125,
      "grad_norm": 2.931699514389038,
      "learning_rate": 6.525e-05,
      "loss": 1.2469,
      "step": 2157
    },
    {
      "epoch": 33.71875,
      "grad_norm": 2.4078547954559326,
      "learning_rate": 6.51875e-05,
      "loss": 1.5852,
      "step": 2158
    },
    {
      "epoch": 33.734375,
      "grad_norm": 2.479557752609253,
      "learning_rate": 6.5125e-05,
      "loss": 1.1943,
      "step": 2159
    },
    {
      "epoch": 33.75,
      "grad_norm": 2.449644088745117,
      "learning_rate": 6.50625e-05,
      "loss": 1.2335,
      "step": 2160
    },
    {
      "epoch": 33.765625,
      "grad_norm": 2.7147414684295654,
      "learning_rate": 6.500000000000001e-05,
      "loss": 1.2386,
      "step": 2161
    },
    {
      "epoch": 33.78125,
      "grad_norm": 2.4628236293792725,
      "learning_rate": 6.493750000000001e-05,
      "loss": 1.2085,
      "step": 2162
    },
    {
      "epoch": 33.796875,
      "grad_norm": 2.622994899749756,
      "learning_rate": 6.4875e-05,
      "loss": 1.2148,
      "step": 2163
    },
    {
      "epoch": 33.8125,
      "grad_norm": 2.261126756668091,
      "learning_rate": 6.48125e-05,
      "loss": 1.1728,
      "step": 2164
    },
    {
      "epoch": 33.828125,
      "grad_norm": 2.3550832271575928,
      "learning_rate": 6.475e-05,
      "loss": 1.3084,
      "step": 2165
    },
    {
      "epoch": 33.84375,
      "grad_norm": 2.6083123683929443,
      "learning_rate": 6.468750000000001e-05,
      "loss": 1.4213,
      "step": 2166
    },
    {
      "epoch": 33.859375,
      "grad_norm": 2.4329724311828613,
      "learning_rate": 6.4625e-05,
      "loss": 1.196,
      "step": 2167
    },
    {
      "epoch": 33.875,
      "grad_norm": 2.421818733215332,
      "learning_rate": 6.45625e-05,
      "loss": 1.2755,
      "step": 2168
    },
    {
      "epoch": 33.890625,
      "grad_norm": 2.3758997917175293,
      "learning_rate": 6.450000000000001e-05,
      "loss": 1.1276,
      "step": 2169
    },
    {
      "epoch": 33.90625,
      "grad_norm": 2.38946795463562,
      "learning_rate": 6.44375e-05,
      "loss": 1.4467,
      "step": 2170
    },
    {
      "epoch": 33.921875,
      "grad_norm": 2.346892833709717,
      "learning_rate": 6.4375e-05,
      "loss": 1.3212,
      "step": 2171
    },
    {
      "epoch": 33.9375,
      "grad_norm": 2.4979467391967773,
      "learning_rate": 6.43125e-05,
      "loss": 1.4568,
      "step": 2172
    },
    {
      "epoch": 33.953125,
      "grad_norm": 2.8379812240600586,
      "learning_rate": 6.425e-05,
      "loss": 1.2514,
      "step": 2173
    },
    {
      "epoch": 33.96875,
      "grad_norm": 2.4893579483032227,
      "learning_rate": 6.41875e-05,
      "loss": 1.4266,
      "step": 2174
    },
    {
      "epoch": 33.984375,
      "grad_norm": 2.771724224090576,
      "learning_rate": 6.4125e-05,
      "loss": 1.3379,
      "step": 2175
    },
    {
      "epoch": 34.0,
      "grad_norm": 3.0840518474578857,
      "learning_rate": 6.40625e-05,
      "loss": 1.427,
      "step": 2176
    },
    {
      "epoch": 34.0,
      "eval_loss": 3.0699846744537354,
      "eval_runtime": 2.8948,
      "eval_samples_per_second": 176.869,
      "eval_steps_per_second": 44.217,
      "step": 2176
    },
    {
      "epoch": 34.015625,
      "grad_norm": 2.621248483657837,
      "learning_rate": 6.400000000000001e-05,
      "loss": 1.0407,
      "step": 2177
    },
    {
      "epoch": 34.03125,
      "grad_norm": 2.6301374435424805,
      "learning_rate": 6.39375e-05,
      "loss": 1.2617,
      "step": 2178
    },
    {
      "epoch": 34.046875,
      "grad_norm": 2.4795725345611572,
      "learning_rate": 6.387500000000001e-05,
      "loss": 1.1801,
      "step": 2179
    },
    {
      "epoch": 34.0625,
      "grad_norm": 2.491211414337158,
      "learning_rate": 6.381250000000001e-05,
      "loss": 1.2911,
      "step": 2180
    },
    {
      "epoch": 34.078125,
      "grad_norm": 2.0048558712005615,
      "learning_rate": 6.375e-05,
      "loss": 1.1511,
      "step": 2181
    },
    {
      "epoch": 34.09375,
      "grad_norm": 2.398858070373535,
      "learning_rate": 6.36875e-05,
      "loss": 1.458,
      "step": 2182
    },
    {
      "epoch": 34.109375,
      "grad_norm": 2.623370885848999,
      "learning_rate": 6.3625e-05,
      "loss": 1.1532,
      "step": 2183
    },
    {
      "epoch": 34.125,
      "grad_norm": 2.251521110534668,
      "learning_rate": 6.356250000000001e-05,
      "loss": 1.2961,
      "step": 2184
    },
    {
      "epoch": 34.140625,
      "grad_norm": 2.2521893978118896,
      "learning_rate": 6.35e-05,
      "loss": 1.3594,
      "step": 2185
    },
    {
      "epoch": 34.15625,
      "grad_norm": 2.565516471862793,
      "learning_rate": 6.34375e-05,
      "loss": 1.1533,
      "step": 2186
    },
    {
      "epoch": 34.171875,
      "grad_norm": 2.3988473415374756,
      "learning_rate": 6.337500000000001e-05,
      "loss": 1.0965,
      "step": 2187
    },
    {
      "epoch": 34.1875,
      "grad_norm": 2.7750558853149414,
      "learning_rate": 6.331250000000001e-05,
      "loss": 1.1674,
      "step": 2188
    },
    {
      "epoch": 34.203125,
      "grad_norm": 2.1511049270629883,
      "learning_rate": 6.324999999999999e-05,
      "loss": 1.4478,
      "step": 2189
    },
    {
      "epoch": 34.21875,
      "grad_norm": 2.3622677326202393,
      "learning_rate": 6.31875e-05,
      "loss": 1.0473,
      "step": 2190
    },
    {
      "epoch": 34.234375,
      "grad_norm": 2.56028413772583,
      "learning_rate": 6.3125e-05,
      "loss": 1.3334,
      "step": 2191
    },
    {
      "epoch": 34.25,
      "grad_norm": 2.1803500652313232,
      "learning_rate": 6.306250000000001e-05,
      "loss": 1.1873,
      "step": 2192
    },
    {
      "epoch": 34.265625,
      "grad_norm": 2.3931899070739746,
      "learning_rate": 6.3e-05,
      "loss": 1.2925,
      "step": 2193
    },
    {
      "epoch": 34.28125,
      "grad_norm": 2.723963499069214,
      "learning_rate": 6.29375e-05,
      "loss": 1.1896,
      "step": 2194
    },
    {
      "epoch": 34.296875,
      "grad_norm": 2.6058292388916016,
      "learning_rate": 6.287500000000001e-05,
      "loss": 1.1942,
      "step": 2195
    },
    {
      "epoch": 34.3125,
      "grad_norm": 2.6553378105163574,
      "learning_rate": 6.28125e-05,
      "loss": 1.233,
      "step": 2196
    },
    {
      "epoch": 34.328125,
      "grad_norm": 2.3545475006103516,
      "learning_rate": 6.275e-05,
      "loss": 1.262,
      "step": 2197
    },
    {
      "epoch": 34.34375,
      "grad_norm": 2.2100675106048584,
      "learning_rate": 6.26875e-05,
      "loss": 1.3916,
      "step": 2198
    },
    {
      "epoch": 34.359375,
      "grad_norm": 2.3541173934936523,
      "learning_rate": 6.2625e-05,
      "loss": 1.3312,
      "step": 2199
    },
    {
      "epoch": 34.375,
      "grad_norm": 2.306574821472168,
      "learning_rate": 6.25625e-05,
      "loss": 1.1167,
      "step": 2200
    },
    {
      "epoch": 34.390625,
      "grad_norm": 2.499152898788452,
      "learning_rate": 6.25e-05,
      "loss": 1.2703,
      "step": 2201
    },
    {
      "epoch": 34.40625,
      "grad_norm": 2.6154847145080566,
      "learning_rate": 6.24375e-05,
      "loss": 1.114,
      "step": 2202
    },
    {
      "epoch": 34.421875,
      "grad_norm": 2.281715154647827,
      "learning_rate": 6.237500000000001e-05,
      "loss": 1.1065,
      "step": 2203
    },
    {
      "epoch": 34.4375,
      "grad_norm": 2.305757999420166,
      "learning_rate": 6.23125e-05,
      "loss": 1.2316,
      "step": 2204
    },
    {
      "epoch": 34.453125,
      "grad_norm": 2.337035655975342,
      "learning_rate": 6.225000000000001e-05,
      "loss": 1.5847,
      "step": 2205
    },
    {
      "epoch": 34.46875,
      "grad_norm": 2.553713083267212,
      "learning_rate": 6.21875e-05,
      "loss": 1.1848,
      "step": 2206
    },
    {
      "epoch": 34.484375,
      "grad_norm": 2.2118682861328125,
      "learning_rate": 6.2125e-05,
      "loss": 1.0839,
      "step": 2207
    },
    {
      "epoch": 34.5,
      "grad_norm": 2.1692755222320557,
      "learning_rate": 6.20625e-05,
      "loss": 1.2489,
      "step": 2208
    },
    {
      "epoch": 34.515625,
      "grad_norm": 2.460430383682251,
      "learning_rate": 6.2e-05,
      "loss": 1.4753,
      "step": 2209
    },
    {
      "epoch": 34.53125,
      "grad_norm": 2.295342206954956,
      "learning_rate": 6.193750000000001e-05,
      "loss": 1.3867,
      "step": 2210
    },
    {
      "epoch": 34.546875,
      "grad_norm": 2.355818033218384,
      "learning_rate": 6.1875e-05,
      "loss": 1.3813,
      "step": 2211
    },
    {
      "epoch": 34.5625,
      "grad_norm": 2.1467106342315674,
      "learning_rate": 6.18125e-05,
      "loss": 1.2231,
      "step": 2212
    },
    {
      "epoch": 34.578125,
      "grad_norm": 2.6584644317626953,
      "learning_rate": 6.175000000000001e-05,
      "loss": 1.2504,
      "step": 2213
    },
    {
      "epoch": 34.59375,
      "grad_norm": 2.4805569648742676,
      "learning_rate": 6.16875e-05,
      "loss": 1.3004,
      "step": 2214
    },
    {
      "epoch": 34.609375,
      "grad_norm": 2.417264461517334,
      "learning_rate": 6.1625e-05,
      "loss": 1.2388,
      "step": 2215
    },
    {
      "epoch": 34.625,
      "grad_norm": 2.3150575160980225,
      "learning_rate": 6.15625e-05,
      "loss": 1.3423,
      "step": 2216
    },
    {
      "epoch": 34.640625,
      "grad_norm": 2.6260488033294678,
      "learning_rate": 6.15e-05,
      "loss": 1.3276,
      "step": 2217
    },
    {
      "epoch": 34.65625,
      "grad_norm": 2.70405650138855,
      "learning_rate": 6.143750000000001e-05,
      "loss": 1.29,
      "step": 2218
    },
    {
      "epoch": 34.671875,
      "grad_norm": 2.3183016777038574,
      "learning_rate": 6.1375e-05,
      "loss": 1.3754,
      "step": 2219
    },
    {
      "epoch": 34.6875,
      "grad_norm": 2.4368069171905518,
      "learning_rate": 6.13125e-05,
      "loss": 1.4631,
      "step": 2220
    },
    {
      "epoch": 34.703125,
      "grad_norm": 2.7015016078948975,
      "learning_rate": 6.125000000000001e-05,
      "loss": 1.2863,
      "step": 2221
    },
    {
      "epoch": 34.71875,
      "grad_norm": 2.625676393508911,
      "learning_rate": 6.11875e-05,
      "loss": 1.2326,
      "step": 2222
    },
    {
      "epoch": 34.734375,
      "grad_norm": 2.521193027496338,
      "learning_rate": 6.1125e-05,
      "loss": 1.1993,
      "step": 2223
    },
    {
      "epoch": 34.75,
      "grad_norm": 2.711362600326538,
      "learning_rate": 6.10625e-05,
      "loss": 1.2115,
      "step": 2224
    },
    {
      "epoch": 34.765625,
      "grad_norm": 2.5228981971740723,
      "learning_rate": 6.1e-05,
      "loss": 1.4102,
      "step": 2225
    },
    {
      "epoch": 34.78125,
      "grad_norm": 2.5978314876556396,
      "learning_rate": 6.0937500000000004e-05,
      "loss": 1.1035,
      "step": 2226
    },
    {
      "epoch": 34.796875,
      "grad_norm": 2.4712677001953125,
      "learning_rate": 6.0875e-05,
      "loss": 1.245,
      "step": 2227
    },
    {
      "epoch": 34.8125,
      "grad_norm": 2.5885565280914307,
      "learning_rate": 6.081250000000001e-05,
      "loss": 1.1685,
      "step": 2228
    },
    {
      "epoch": 34.828125,
      "grad_norm": 2.2120187282562256,
      "learning_rate": 6.0750000000000006e-05,
      "loss": 1.2873,
      "step": 2229
    },
    {
      "epoch": 34.84375,
      "grad_norm": 2.450796365737915,
      "learning_rate": 6.068750000000001e-05,
      "loss": 1.3963,
      "step": 2230
    },
    {
      "epoch": 34.859375,
      "grad_norm": 2.5241200923919678,
      "learning_rate": 6.0624999999999996e-05,
      "loss": 1.1989,
      "step": 2231
    },
    {
      "epoch": 34.875,
      "grad_norm": 2.5413520336151123,
      "learning_rate": 6.05625e-05,
      "loss": 1.2706,
      "step": 2232
    },
    {
      "epoch": 34.890625,
      "grad_norm": 2.445969343185425,
      "learning_rate": 6.05e-05,
      "loss": 1.1724,
      "step": 2233
    },
    {
      "epoch": 34.90625,
      "grad_norm": 2.530601978302002,
      "learning_rate": 6.0437500000000005e-05,
      "loss": 1.2263,
      "step": 2234
    },
    {
      "epoch": 34.921875,
      "grad_norm": 2.5277838706970215,
      "learning_rate": 6.0375000000000004e-05,
      "loss": 1.3829,
      "step": 2235
    },
    {
      "epoch": 34.9375,
      "grad_norm": 2.423532485961914,
      "learning_rate": 6.03125e-05,
      "loss": 1.1733,
      "step": 2236
    },
    {
      "epoch": 34.953125,
      "grad_norm": 2.5365700721740723,
      "learning_rate": 6.025000000000001e-05,
      "loss": 1.223,
      "step": 2237
    },
    {
      "epoch": 34.96875,
      "grad_norm": 2.3549962043762207,
      "learning_rate": 6.0187500000000006e-05,
      "loss": 1.2363,
      "step": 2238
    },
    {
      "epoch": 34.984375,
      "grad_norm": 2.614260196685791,
      "learning_rate": 6.0125e-05,
      "loss": 0.9574,
      "step": 2239
    },
    {
      "epoch": 35.0,
      "grad_norm": 2.7502517700195312,
      "learning_rate": 6.0062499999999996e-05,
      "loss": 1.3481,
      "step": 2240
    },
    {
      "epoch": 35.0,
      "eval_loss": 3.0711793899536133,
      "eval_runtime": 2.9175,
      "eval_samples_per_second": 175.49,
      "eval_steps_per_second": 43.872,
      "step": 2240
    },
    {
      "epoch": 35.015625,
      "grad_norm": 2.2939207553863525,
      "learning_rate": 6e-05,
      "loss": 1.4826,
      "step": 2241
    },
    {
      "epoch": 35.03125,
      "grad_norm": 2.4889822006225586,
      "learning_rate": 5.99375e-05,
      "loss": 1.3054,
      "step": 2242
    },
    {
      "epoch": 35.046875,
      "grad_norm": 2.7636795043945312,
      "learning_rate": 5.9875000000000005e-05,
      "loss": 1.2805,
      "step": 2243
    },
    {
      "epoch": 35.0625,
      "grad_norm": 2.2666163444519043,
      "learning_rate": 5.9812500000000004e-05,
      "loss": 1.3368,
      "step": 2244
    },
    {
      "epoch": 35.078125,
      "grad_norm": 2.1629207134246826,
      "learning_rate": 5.975000000000001e-05,
      "loss": 1.2695,
      "step": 2245
    },
    {
      "epoch": 35.09375,
      "grad_norm": 2.4970579147338867,
      "learning_rate": 5.968750000000001e-05,
      "loss": 1.3341,
      "step": 2246
    },
    {
      "epoch": 35.109375,
      "grad_norm": 2.1390249729156494,
      "learning_rate": 5.9625e-05,
      "loss": 1.3061,
      "step": 2247
    },
    {
      "epoch": 35.125,
      "grad_norm": 2.9151482582092285,
      "learning_rate": 5.95625e-05,
      "loss": 1.0357,
      "step": 2248
    },
    {
      "epoch": 35.140625,
      "grad_norm": 2.2600438594818115,
      "learning_rate": 5.95e-05,
      "loss": 1.28,
      "step": 2249
    },
    {
      "epoch": 35.15625,
      "grad_norm": 2.557391881942749,
      "learning_rate": 5.94375e-05,
      "loss": 1.0739,
      "step": 2250
    },
    {
      "epoch": 35.171875,
      "grad_norm": 2.3806722164154053,
      "learning_rate": 5.9375e-05,
      "loss": 1.0772,
      "step": 2251
    },
    {
      "epoch": 35.1875,
      "grad_norm": 2.640036106109619,
      "learning_rate": 5.9312500000000005e-05,
      "loss": 1.183,
      "step": 2252
    },
    {
      "epoch": 35.203125,
      "grad_norm": 2.6599721908569336,
      "learning_rate": 5.9250000000000004e-05,
      "loss": 1.0995,
      "step": 2253
    },
    {
      "epoch": 35.21875,
      "grad_norm": 2.774445056915283,
      "learning_rate": 5.918750000000001e-05,
      "loss": 1.2628,
      "step": 2254
    },
    {
      "epoch": 35.234375,
      "grad_norm": 2.4477896690368652,
      "learning_rate": 5.912500000000001e-05,
      "loss": 1.3209,
      "step": 2255
    },
    {
      "epoch": 35.25,
      "grad_norm": 2.5019731521606445,
      "learning_rate": 5.90625e-05,
      "loss": 1.2143,
      "step": 2256
    },
    {
      "epoch": 35.265625,
      "grad_norm": 2.7371368408203125,
      "learning_rate": 5.9e-05,
      "loss": 1.2726,
      "step": 2257
    },
    {
      "epoch": 35.28125,
      "grad_norm": 2.3881452083587646,
      "learning_rate": 5.89375e-05,
      "loss": 1.6029,
      "step": 2258
    },
    {
      "epoch": 35.296875,
      "grad_norm": 2.682079315185547,
      "learning_rate": 5.8875e-05,
      "loss": 1.2603,
      "step": 2259
    },
    {
      "epoch": 35.3125,
      "grad_norm": 2.362192153930664,
      "learning_rate": 5.8812500000000007e-05,
      "loss": 1.3269,
      "step": 2260
    },
    {
      "epoch": 35.328125,
      "grad_norm": 2.552406072616577,
      "learning_rate": 5.8750000000000005e-05,
      "loss": 1.2792,
      "step": 2261
    },
    {
      "epoch": 35.34375,
      "grad_norm": 2.727374792098999,
      "learning_rate": 5.8687500000000003e-05,
      "loss": 1.2977,
      "step": 2262
    },
    {
      "epoch": 35.359375,
      "grad_norm": 2.4189090728759766,
      "learning_rate": 5.862500000000001e-05,
      "loss": 1.2049,
      "step": 2263
    },
    {
      "epoch": 35.375,
      "grad_norm": 2.3692033290863037,
      "learning_rate": 5.85625e-05,
      "loss": 1.397,
      "step": 2264
    },
    {
      "epoch": 35.390625,
      "grad_norm": 2.6948912143707275,
      "learning_rate": 5.85e-05,
      "loss": 1.267,
      "step": 2265
    },
    {
      "epoch": 35.40625,
      "grad_norm": 2.5508012771606445,
      "learning_rate": 5.84375e-05,
      "loss": 1.1922,
      "step": 2266
    },
    {
      "epoch": 35.421875,
      "grad_norm": 2.575798273086548,
      "learning_rate": 5.8375e-05,
      "loss": 1.2051,
      "step": 2267
    },
    {
      "epoch": 35.4375,
      "grad_norm": 2.4820456504821777,
      "learning_rate": 5.83125e-05,
      "loss": 1.2695,
      "step": 2268
    },
    {
      "epoch": 35.453125,
      "grad_norm": 2.472069263458252,
      "learning_rate": 5.8250000000000006e-05,
      "loss": 1.2512,
      "step": 2269
    },
    {
      "epoch": 35.46875,
      "grad_norm": 2.3181354999542236,
      "learning_rate": 5.8187500000000005e-05,
      "loss": 1.241,
      "step": 2270
    },
    {
      "epoch": 35.484375,
      "grad_norm": 2.4154093265533447,
      "learning_rate": 5.812500000000001e-05,
      "loss": 1.2657,
      "step": 2271
    },
    {
      "epoch": 35.5,
      "grad_norm": 2.608938694000244,
      "learning_rate": 5.8062499999999995e-05,
      "loss": 1.1008,
      "step": 2272
    },
    {
      "epoch": 35.515625,
      "grad_norm": 2.210602283477783,
      "learning_rate": 5.8e-05,
      "loss": 1.2939,
      "step": 2273
    },
    {
      "epoch": 35.53125,
      "grad_norm": 2.2572803497314453,
      "learning_rate": 5.79375e-05,
      "loss": 1.0626,
      "step": 2274
    },
    {
      "epoch": 35.546875,
      "grad_norm": 2.2787907123565674,
      "learning_rate": 5.7875000000000004e-05,
      "loss": 1.0831,
      "step": 2275
    },
    {
      "epoch": 35.5625,
      "grad_norm": 2.5927491188049316,
      "learning_rate": 5.78125e-05,
      "loss": 1.295,
      "step": 2276
    },
    {
      "epoch": 35.578125,
      "grad_norm": 2.4913947582244873,
      "learning_rate": 5.775e-05,
      "loss": 1.1686,
      "step": 2277
    },
    {
      "epoch": 35.59375,
      "grad_norm": 2.3924849033355713,
      "learning_rate": 5.7687500000000006e-05,
      "loss": 1.3795,
      "step": 2278
    },
    {
      "epoch": 35.609375,
      "grad_norm": 2.449169397354126,
      "learning_rate": 5.7625000000000005e-05,
      "loss": 1.394,
      "step": 2279
    },
    {
      "epoch": 35.625,
      "grad_norm": 2.312316417694092,
      "learning_rate": 5.756250000000001e-05,
      "loss": 1.4324,
      "step": 2280
    },
    {
      "epoch": 35.640625,
      "grad_norm": 2.642113447189331,
      "learning_rate": 5.7499999999999995e-05,
      "loss": 1.1613,
      "step": 2281
    },
    {
      "epoch": 35.65625,
      "grad_norm": 2.7588725090026855,
      "learning_rate": 5.74375e-05,
      "loss": 1.2176,
      "step": 2282
    },
    {
      "epoch": 35.671875,
      "grad_norm": 2.0413198471069336,
      "learning_rate": 5.7375e-05,
      "loss": 1.1912,
      "step": 2283
    },
    {
      "epoch": 35.6875,
      "grad_norm": 2.3424153327941895,
      "learning_rate": 5.7312500000000004e-05,
      "loss": 1.2882,
      "step": 2284
    },
    {
      "epoch": 35.703125,
      "grad_norm": 2.467233419418335,
      "learning_rate": 5.725e-05,
      "loss": 1.0719,
      "step": 2285
    },
    {
      "epoch": 35.71875,
      "grad_norm": 2.316485643386841,
      "learning_rate": 5.718750000000001e-05,
      "loss": 1.2477,
      "step": 2286
    },
    {
      "epoch": 35.734375,
      "grad_norm": 2.722137689590454,
      "learning_rate": 5.7125000000000006e-05,
      "loss": 1.4396,
      "step": 2287
    },
    {
      "epoch": 35.75,
      "grad_norm": 2.3641586303710938,
      "learning_rate": 5.7062500000000005e-05,
      "loss": 1.1915,
      "step": 2288
    },
    {
      "epoch": 35.765625,
      "grad_norm": 2.388256311416626,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 1.1656,
      "step": 2289
    },
    {
      "epoch": 35.78125,
      "grad_norm": 2.598712682723999,
      "learning_rate": 5.69375e-05,
      "loss": 1.2862,
      "step": 2290
    },
    {
      "epoch": 35.796875,
      "grad_norm": 2.588616371154785,
      "learning_rate": 5.6875e-05,
      "loss": 1.3749,
      "step": 2291
    },
    {
      "epoch": 35.8125,
      "grad_norm": 2.6594958305358887,
      "learning_rate": 5.68125e-05,
      "loss": 1.3178,
      "step": 2292
    },
    {
      "epoch": 35.828125,
      "grad_norm": 2.5284249782562256,
      "learning_rate": 5.6750000000000004e-05,
      "loss": 1.5196,
      "step": 2293
    },
    {
      "epoch": 35.84375,
      "grad_norm": 2.2589025497436523,
      "learning_rate": 5.66875e-05,
      "loss": 1.0865,
      "step": 2294
    },
    {
      "epoch": 35.859375,
      "grad_norm": 2.4906656742095947,
      "learning_rate": 5.662500000000001e-05,
      "loss": 1.1322,
      "step": 2295
    },
    {
      "epoch": 35.875,
      "grad_norm": 2.7757463455200195,
      "learning_rate": 5.6562500000000006e-05,
      "loss": 1.2787,
      "step": 2296
    },
    {
      "epoch": 35.890625,
      "grad_norm": 2.5835256576538086,
      "learning_rate": 5.65e-05,
      "loss": 1.4197,
      "step": 2297
    },
    {
      "epoch": 35.90625,
      "grad_norm": 2.719505548477173,
      "learning_rate": 5.6437499999999996e-05,
      "loss": 1.2412,
      "step": 2298
    },
    {
      "epoch": 35.921875,
      "grad_norm": 2.6721060276031494,
      "learning_rate": 5.6375e-05,
      "loss": 1.109,
      "step": 2299
    },
    {
      "epoch": 35.9375,
      "grad_norm": 2.7934272289276123,
      "learning_rate": 5.63125e-05,
      "loss": 1.2766,
      "step": 2300
    },
    {
      "epoch": 35.953125,
      "grad_norm": 2.3783857822418213,
      "learning_rate": 5.6250000000000005e-05,
      "loss": 1.2888,
      "step": 2301
    },
    {
      "epoch": 35.96875,
      "grad_norm": 2.452106237411499,
      "learning_rate": 5.6187500000000004e-05,
      "loss": 1.4316,
      "step": 2302
    },
    {
      "epoch": 35.984375,
      "grad_norm": 2.4148995876312256,
      "learning_rate": 5.6125e-05,
      "loss": 1.235,
      "step": 2303
    },
    {
      "epoch": 36.0,
      "grad_norm": 2.7353062629699707,
      "learning_rate": 5.606250000000001e-05,
      "loss": 1.3419,
      "step": 2304
    },
    {
      "epoch": 36.0,
      "eval_loss": 3.078512668609619,
      "eval_runtime": 2.9592,
      "eval_samples_per_second": 173.017,
      "eval_steps_per_second": 43.254,
      "step": 2304
    },
    {
      "epoch": 36.015625,
      "grad_norm": 2.293055295944214,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.2008,
      "step": 2305
    },
    {
      "epoch": 36.03125,
      "grad_norm": 2.7334182262420654,
      "learning_rate": 5.59375e-05,
      "loss": 1.4058,
      "step": 2306
    },
    {
      "epoch": 36.046875,
      "grad_norm": 2.136500597000122,
      "learning_rate": 5.5875e-05,
      "loss": 1.189,
      "step": 2307
    },
    {
      "epoch": 36.0625,
      "grad_norm": 2.615905523300171,
      "learning_rate": 5.58125e-05,
      "loss": 1.2015,
      "step": 2308
    },
    {
      "epoch": 36.078125,
      "grad_norm": 2.359585762023926,
      "learning_rate": 5.575e-05,
      "loss": 1.3564,
      "step": 2309
    },
    {
      "epoch": 36.09375,
      "grad_norm": 2.1364951133728027,
      "learning_rate": 5.5687500000000005e-05,
      "loss": 1.2167,
      "step": 2310
    },
    {
      "epoch": 36.109375,
      "grad_norm": 2.6273434162139893,
      "learning_rate": 5.5625000000000004e-05,
      "loss": 1.054,
      "step": 2311
    },
    {
      "epoch": 36.125,
      "grad_norm": 2.513120651245117,
      "learning_rate": 5.556250000000001e-05,
      "loss": 1.4167,
      "step": 2312
    },
    {
      "epoch": 36.140625,
      "grad_norm": 2.3284199237823486,
      "learning_rate": 5.550000000000001e-05,
      "loss": 1.22,
      "step": 2313
    },
    {
      "epoch": 36.15625,
      "grad_norm": 2.926490068435669,
      "learning_rate": 5.54375e-05,
      "loss": 1.1188,
      "step": 2314
    },
    {
      "epoch": 36.171875,
      "grad_norm": 2.494903087615967,
      "learning_rate": 5.5375e-05,
      "loss": 1.2042,
      "step": 2315
    },
    {
      "epoch": 36.1875,
      "grad_norm": 2.634472370147705,
      "learning_rate": 5.53125e-05,
      "loss": 1.214,
      "step": 2316
    },
    {
      "epoch": 36.203125,
      "grad_norm": 2.650930643081665,
      "learning_rate": 5.525e-05,
      "loss": 0.9331,
      "step": 2317
    },
    {
      "epoch": 36.21875,
      "grad_norm": 2.5292301177978516,
      "learning_rate": 5.51875e-05,
      "loss": 1.1619,
      "step": 2318
    },
    {
      "epoch": 36.234375,
      "grad_norm": 2.3784165382385254,
      "learning_rate": 5.5125000000000005e-05,
      "loss": 1.3089,
      "step": 2319
    },
    {
      "epoch": 36.25,
      "grad_norm": 2.362964391708374,
      "learning_rate": 5.5062500000000003e-05,
      "loss": 1.3207,
      "step": 2320
    },
    {
      "epoch": 36.265625,
      "grad_norm": 2.6006927490234375,
      "learning_rate": 5.500000000000001e-05,
      "loss": 1.2094,
      "step": 2321
    },
    {
      "epoch": 36.28125,
      "grad_norm": 2.6247060298919678,
      "learning_rate": 5.49375e-05,
      "loss": 1.4585,
      "step": 2322
    },
    {
      "epoch": 36.296875,
      "grad_norm": 2.7820475101470947,
      "learning_rate": 5.4875e-05,
      "loss": 1.1577,
      "step": 2323
    },
    {
      "epoch": 36.3125,
      "grad_norm": 2.4589390754699707,
      "learning_rate": 5.48125e-05,
      "loss": 1.2088,
      "step": 2324
    },
    {
      "epoch": 36.328125,
      "grad_norm": 2.166093349456787,
      "learning_rate": 5.475e-05,
      "loss": 1.2758,
      "step": 2325
    },
    {
      "epoch": 36.34375,
      "grad_norm": 2.5810000896453857,
      "learning_rate": 5.46875e-05,
      "loss": 1.3203,
      "step": 2326
    },
    {
      "epoch": 36.359375,
      "grad_norm": 2.6933956146240234,
      "learning_rate": 5.4625000000000006e-05,
      "loss": 1.2038,
      "step": 2327
    },
    {
      "epoch": 36.375,
      "grad_norm": 2.1069705486297607,
      "learning_rate": 5.4562500000000005e-05,
      "loss": 1.2359,
      "step": 2328
    },
    {
      "epoch": 36.390625,
      "grad_norm": 2.666701316833496,
      "learning_rate": 5.45e-05,
      "loss": 1.2713,
      "step": 2329
    },
    {
      "epoch": 36.40625,
      "grad_norm": 2.4012372493743896,
      "learning_rate": 5.443750000000001e-05,
      "loss": 1.4093,
      "step": 2330
    },
    {
      "epoch": 36.421875,
      "grad_norm": 2.2821428775787354,
      "learning_rate": 5.4375e-05,
      "loss": 1.2046,
      "step": 2331
    },
    {
      "epoch": 36.4375,
      "grad_norm": 2.1719043254852295,
      "learning_rate": 5.43125e-05,
      "loss": 1.0595,
      "step": 2332
    },
    {
      "epoch": 36.453125,
      "grad_norm": 2.1041007041931152,
      "learning_rate": 5.4250000000000004e-05,
      "loss": 1.2016,
      "step": 2333
    },
    {
      "epoch": 36.46875,
      "grad_norm": 2.4056904315948486,
      "learning_rate": 5.41875e-05,
      "loss": 1.324,
      "step": 2334
    },
    {
      "epoch": 36.484375,
      "grad_norm": 2.4361231327056885,
      "learning_rate": 5.4125e-05,
      "loss": 1.3304,
      "step": 2335
    },
    {
      "epoch": 36.5,
      "grad_norm": 2.473414182662964,
      "learning_rate": 5.4062500000000006e-05,
      "loss": 1.2958,
      "step": 2336
    },
    {
      "epoch": 36.515625,
      "grad_norm": 2.470560073852539,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 1.3052,
      "step": 2337
    },
    {
      "epoch": 36.53125,
      "grad_norm": 2.7024569511413574,
      "learning_rate": 5.393750000000001e-05,
      "loss": 1.182,
      "step": 2338
    },
    {
      "epoch": 36.546875,
      "grad_norm": 2.4274299144744873,
      "learning_rate": 5.3874999999999995e-05,
      "loss": 1.4084,
      "step": 2339
    },
    {
      "epoch": 36.5625,
      "grad_norm": 2.582984447479248,
      "learning_rate": 5.38125e-05,
      "loss": 1.0879,
      "step": 2340
    },
    {
      "epoch": 36.578125,
      "grad_norm": 2.4038915634155273,
      "learning_rate": 5.375e-05,
      "loss": 1.2772,
      "step": 2341
    },
    {
      "epoch": 36.59375,
      "grad_norm": 2.569121837615967,
      "learning_rate": 5.3687500000000004e-05,
      "loss": 1.0045,
      "step": 2342
    },
    {
      "epoch": 36.609375,
      "grad_norm": 2.220290184020996,
      "learning_rate": 5.3625e-05,
      "loss": 1.2767,
      "step": 2343
    },
    {
      "epoch": 36.625,
      "grad_norm": 2.094897508621216,
      "learning_rate": 5.356250000000001e-05,
      "loss": 1.2691,
      "step": 2344
    },
    {
      "epoch": 36.640625,
      "grad_norm": 2.5187957286834717,
      "learning_rate": 5.3500000000000006e-05,
      "loss": 1.1759,
      "step": 2345
    },
    {
      "epoch": 36.65625,
      "grad_norm": 2.575575351715088,
      "learning_rate": 5.3437500000000005e-05,
      "loss": 1.098,
      "step": 2346
    },
    {
      "epoch": 36.671875,
      "grad_norm": 2.3473026752471924,
      "learning_rate": 5.3374999999999996e-05,
      "loss": 1.2787,
      "step": 2347
    },
    {
      "epoch": 36.6875,
      "grad_norm": 2.5047807693481445,
      "learning_rate": 5.33125e-05,
      "loss": 1.3042,
      "step": 2348
    },
    {
      "epoch": 36.703125,
      "grad_norm": 2.422968626022339,
      "learning_rate": 5.325e-05,
      "loss": 1.2585,
      "step": 2349
    },
    {
      "epoch": 36.71875,
      "grad_norm": 3.1577398777008057,
      "learning_rate": 5.31875e-05,
      "loss": 1.1515,
      "step": 2350
    },
    {
      "epoch": 36.734375,
      "grad_norm": 2.380279064178467,
      "learning_rate": 5.3125000000000004e-05,
      "loss": 1.1116,
      "step": 2351
    },
    {
      "epoch": 36.75,
      "grad_norm": 2.3588459491729736,
      "learning_rate": 5.30625e-05,
      "loss": 1.4513,
      "step": 2352
    },
    {
      "epoch": 36.765625,
      "grad_norm": 2.351170301437378,
      "learning_rate": 5.300000000000001e-05,
      "loss": 1.2027,
      "step": 2353
    },
    {
      "epoch": 36.78125,
      "grad_norm": 2.7660365104675293,
      "learning_rate": 5.2937500000000006e-05,
      "loss": 1.3643,
      "step": 2354
    },
    {
      "epoch": 36.796875,
      "grad_norm": 2.5424463748931885,
      "learning_rate": 5.2875000000000005e-05,
      "loss": 1.0079,
      "step": 2355
    },
    {
      "epoch": 36.8125,
      "grad_norm": 2.2590010166168213,
      "learning_rate": 5.2812499999999996e-05,
      "loss": 1.3246,
      "step": 2356
    },
    {
      "epoch": 36.828125,
      "grad_norm": 2.558171510696411,
      "learning_rate": 5.275e-05,
      "loss": 1.1275,
      "step": 2357
    },
    {
      "epoch": 36.84375,
      "grad_norm": 2.690706253051758,
      "learning_rate": 5.26875e-05,
      "loss": 1.2428,
      "step": 2358
    },
    {
      "epoch": 36.859375,
      "grad_norm": 2.206742763519287,
      "learning_rate": 5.2625000000000005e-05,
      "loss": 1.3627,
      "step": 2359
    },
    {
      "epoch": 36.875,
      "grad_norm": 2.3664448261260986,
      "learning_rate": 5.2562500000000004e-05,
      "loss": 1.185,
      "step": 2360
    },
    {
      "epoch": 36.890625,
      "grad_norm": 2.205796480178833,
      "learning_rate": 5.25e-05,
      "loss": 1.4351,
      "step": 2361
    },
    {
      "epoch": 36.90625,
      "grad_norm": 2.4413788318634033,
      "learning_rate": 5.243750000000001e-05,
      "loss": 1.2215,
      "step": 2362
    },
    {
      "epoch": 36.921875,
      "grad_norm": 2.666926860809326,
      "learning_rate": 5.2375000000000006e-05,
      "loss": 1.0707,
      "step": 2363
    },
    {
      "epoch": 36.9375,
      "grad_norm": 2.522573947906494,
      "learning_rate": 5.23125e-05,
      "loss": 1.3857,
      "step": 2364
    },
    {
      "epoch": 36.953125,
      "grad_norm": 2.6294405460357666,
      "learning_rate": 5.2249999999999996e-05,
      "loss": 1.3014,
      "step": 2365
    },
    {
      "epoch": 36.96875,
      "grad_norm": 2.3480916023254395,
      "learning_rate": 5.21875e-05,
      "loss": 0.9582,
      "step": 2366
    },
    {
      "epoch": 36.984375,
      "grad_norm": 2.5063672065734863,
      "learning_rate": 5.2125e-05,
      "loss": 1.2426,
      "step": 2367
    },
    {
      "epoch": 37.0,
      "grad_norm": 2.864591598510742,
      "learning_rate": 5.2062500000000005e-05,
      "loss": 1.5829,
      "step": 2368
    },
    {
      "epoch": 37.0,
      "eval_loss": 3.0820205211639404,
      "eval_runtime": 2.9653,
      "eval_samples_per_second": 172.666,
      "eval_steps_per_second": 43.167,
      "step": 2368
    },
    {
      "epoch": 37.015625,
      "grad_norm": 2.3075387477874756,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.2392,
      "step": 2369
    },
    {
      "epoch": 37.03125,
      "grad_norm": 2.430769920349121,
      "learning_rate": 5.193750000000001e-05,
      "loss": 1.2271,
      "step": 2370
    },
    {
      "epoch": 37.046875,
      "grad_norm": 2.7212672233581543,
      "learning_rate": 5.187500000000001e-05,
      "loss": 1.3755,
      "step": 2371
    },
    {
      "epoch": 37.0625,
      "grad_norm": 2.2438063621520996,
      "learning_rate": 5.18125e-05,
      "loss": 1.2871,
      "step": 2372
    },
    {
      "epoch": 37.078125,
      "grad_norm": 2.4187161922454834,
      "learning_rate": 5.175e-05,
      "loss": 1.2533,
      "step": 2373
    },
    {
      "epoch": 37.09375,
      "grad_norm": 2.40276837348938,
      "learning_rate": 5.16875e-05,
      "loss": 1.2657,
      "step": 2374
    },
    {
      "epoch": 37.109375,
      "grad_norm": 2.1988532543182373,
      "learning_rate": 5.1625e-05,
      "loss": 1.0614,
      "step": 2375
    },
    {
      "epoch": 37.125,
      "grad_norm": 2.4844489097595215,
      "learning_rate": 5.15625e-05,
      "loss": 1.0554,
      "step": 2376
    },
    {
      "epoch": 37.140625,
      "grad_norm": 2.4430973529815674,
      "learning_rate": 5.1500000000000005e-05,
      "loss": 1.2189,
      "step": 2377
    },
    {
      "epoch": 37.15625,
      "grad_norm": 2.289041519165039,
      "learning_rate": 5.1437500000000003e-05,
      "loss": 1.3341,
      "step": 2378
    },
    {
      "epoch": 37.171875,
      "grad_norm": 2.5151991844177246,
      "learning_rate": 5.137500000000001e-05,
      "loss": 1.1689,
      "step": 2379
    },
    {
      "epoch": 37.1875,
      "grad_norm": 2.420222043991089,
      "learning_rate": 5.131250000000001e-05,
      "loss": 1.335,
      "step": 2380
    },
    {
      "epoch": 37.203125,
      "grad_norm": 2.4562532901763916,
      "learning_rate": 5.125e-05,
      "loss": 1.0743,
      "step": 2381
    },
    {
      "epoch": 37.21875,
      "grad_norm": 2.5890419483184814,
      "learning_rate": 5.11875e-05,
      "loss": 1.2355,
      "step": 2382
    },
    {
      "epoch": 37.234375,
      "grad_norm": 2.3465209007263184,
      "learning_rate": 5.1125e-05,
      "loss": 1.3818,
      "step": 2383
    },
    {
      "epoch": 37.25,
      "grad_norm": 2.231661081314087,
      "learning_rate": 5.10625e-05,
      "loss": 1.35,
      "step": 2384
    },
    {
      "epoch": 37.265625,
      "grad_norm": 2.664698600769043,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 1.1415,
      "step": 2385
    },
    {
      "epoch": 37.28125,
      "grad_norm": 2.514580011367798,
      "learning_rate": 5.0937500000000005e-05,
      "loss": 1.0413,
      "step": 2386
    },
    {
      "epoch": 37.296875,
      "grad_norm": 2.4381818771362305,
      "learning_rate": 5.0875e-05,
      "loss": 1.1949,
      "step": 2387
    },
    {
      "epoch": 37.3125,
      "grad_norm": 2.4892055988311768,
      "learning_rate": 5.081250000000001e-05,
      "loss": 1.1936,
      "step": 2388
    },
    {
      "epoch": 37.328125,
      "grad_norm": 2.606065034866333,
      "learning_rate": 5.075e-05,
      "loss": 1.1354,
      "step": 2389
    },
    {
      "epoch": 37.34375,
      "grad_norm": 2.2671077251434326,
      "learning_rate": 5.06875e-05,
      "loss": 0.9665,
      "step": 2390
    },
    {
      "epoch": 37.359375,
      "grad_norm": 2.280754804611206,
      "learning_rate": 5.0625e-05,
      "loss": 1.2829,
      "step": 2391
    },
    {
      "epoch": 37.375,
      "grad_norm": 2.4497718811035156,
      "learning_rate": 5.05625e-05,
      "loss": 1.4379,
      "step": 2392
    },
    {
      "epoch": 37.390625,
      "grad_norm": 3.0680718421936035,
      "learning_rate": 5.05e-05,
      "loss": 1.1431,
      "step": 2393
    },
    {
      "epoch": 37.40625,
      "grad_norm": 2.3538811206817627,
      "learning_rate": 5.0437500000000006e-05,
      "loss": 1.3421,
      "step": 2394
    },
    {
      "epoch": 37.421875,
      "grad_norm": 2.307070016860962,
      "learning_rate": 5.0375000000000005e-05,
      "loss": 1.1447,
      "step": 2395
    },
    {
      "epoch": 37.4375,
      "grad_norm": 2.480508327484131,
      "learning_rate": 5.031250000000001e-05,
      "loss": 1.3081,
      "step": 2396
    },
    {
      "epoch": 37.453125,
      "grad_norm": 2.4542768001556396,
      "learning_rate": 5.0249999999999995e-05,
      "loss": 1.2007,
      "step": 2397
    },
    {
      "epoch": 37.46875,
      "grad_norm": 2.531865358352661,
      "learning_rate": 5.01875e-05,
      "loss": 1.137,
      "step": 2398
    },
    {
      "epoch": 37.484375,
      "grad_norm": 2.2803564071655273,
      "learning_rate": 5.0125e-05,
      "loss": 1.0852,
      "step": 2399
    },
    {
      "epoch": 37.5,
      "grad_norm": 2.777787208557129,
      "learning_rate": 5.0062500000000004e-05,
      "loss": 1.1012,
      "step": 2400
    },
    {
      "epoch": 37.515625,
      "grad_norm": 2.388749837875366,
      "learning_rate": 5e-05,
      "loss": 1.3427,
      "step": 2401
    },
    {
      "epoch": 37.53125,
      "grad_norm": 2.6875686645507812,
      "learning_rate": 4.99375e-05,
      "loss": 1.1348,
      "step": 2402
    },
    {
      "epoch": 37.546875,
      "grad_norm": 2.3569982051849365,
      "learning_rate": 4.9875000000000006e-05,
      "loss": 1.348,
      "step": 2403
    },
    {
      "epoch": 37.5625,
      "grad_norm": 2.6640872955322266,
      "learning_rate": 4.98125e-05,
      "loss": 1.1781,
      "step": 2404
    },
    {
      "epoch": 37.578125,
      "grad_norm": 2.1582882404327393,
      "learning_rate": 4.975e-05,
      "loss": 1.3831,
      "step": 2405
    },
    {
      "epoch": 37.59375,
      "grad_norm": 2.4958577156066895,
      "learning_rate": 4.96875e-05,
      "loss": 1.3151,
      "step": 2406
    },
    {
      "epoch": 37.609375,
      "grad_norm": 1.9734408855438232,
      "learning_rate": 4.962500000000001e-05,
      "loss": 1.1772,
      "step": 2407
    },
    {
      "epoch": 37.625,
      "grad_norm": 2.675837993621826,
      "learning_rate": 4.95625e-05,
      "loss": 1.2612,
      "step": 2408
    },
    {
      "epoch": 37.640625,
      "grad_norm": 2.777552843093872,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 1.2703,
      "step": 2409
    },
    {
      "epoch": 37.65625,
      "grad_norm": 2.565886974334717,
      "learning_rate": 4.94375e-05,
      "loss": 1.1364,
      "step": 2410
    },
    {
      "epoch": 37.671875,
      "grad_norm": 2.4439425468444824,
      "learning_rate": 4.937500000000001e-05,
      "loss": 1.2561,
      "step": 2411
    },
    {
      "epoch": 37.6875,
      "grad_norm": 2.2892942428588867,
      "learning_rate": 4.93125e-05,
      "loss": 1.3448,
      "step": 2412
    },
    {
      "epoch": 37.703125,
      "grad_norm": 2.555926561355591,
      "learning_rate": 4.9250000000000004e-05,
      "loss": 1.5386,
      "step": 2413
    },
    {
      "epoch": 37.71875,
      "grad_norm": 2.445833444595337,
      "learning_rate": 4.91875e-05,
      "loss": 1.3491,
      "step": 2414
    },
    {
      "epoch": 37.734375,
      "grad_norm": 2.487778902053833,
      "learning_rate": 4.9125e-05,
      "loss": 1.1476,
      "step": 2415
    },
    {
      "epoch": 37.75,
      "grad_norm": 2.5183279514312744,
      "learning_rate": 4.90625e-05,
      "loss": 1.2879,
      "step": 2416
    },
    {
      "epoch": 37.765625,
      "grad_norm": 2.4757347106933594,
      "learning_rate": 4.9e-05,
      "loss": 1.2058,
      "step": 2417
    },
    {
      "epoch": 37.78125,
      "grad_norm": 2.289010524749756,
      "learning_rate": 4.8937500000000004e-05,
      "loss": 1.1848,
      "step": 2418
    },
    {
      "epoch": 37.796875,
      "grad_norm": 2.8340749740600586,
      "learning_rate": 4.8875e-05,
      "loss": 1.1456,
      "step": 2419
    },
    {
      "epoch": 37.8125,
      "grad_norm": 2.6495087146759033,
      "learning_rate": 4.88125e-05,
      "loss": 1.3464,
      "step": 2420
    },
    {
      "epoch": 37.828125,
      "grad_norm": 2.4645087718963623,
      "learning_rate": 4.875e-05,
      "loss": 1.1609,
      "step": 2421
    },
    {
      "epoch": 37.84375,
      "grad_norm": 2.2095673084259033,
      "learning_rate": 4.8687500000000004e-05,
      "loss": 1.1648,
      "step": 2422
    },
    {
      "epoch": 37.859375,
      "grad_norm": 2.478856086730957,
      "learning_rate": 4.8625e-05,
      "loss": 1.2906,
      "step": 2423
    },
    {
      "epoch": 37.875,
      "grad_norm": 2.2757370471954346,
      "learning_rate": 4.85625e-05,
      "loss": 1.3483,
      "step": 2424
    },
    {
      "epoch": 37.890625,
      "grad_norm": 2.5071184635162354,
      "learning_rate": 4.85e-05,
      "loss": 1.2912,
      "step": 2425
    },
    {
      "epoch": 37.90625,
      "grad_norm": 2.1407370567321777,
      "learning_rate": 4.8437500000000005e-05,
      "loss": 1.2158,
      "step": 2426
    },
    {
      "epoch": 37.921875,
      "grad_norm": 2.5735576152801514,
      "learning_rate": 4.8375000000000004e-05,
      "loss": 1.5229,
      "step": 2427
    },
    {
      "epoch": 37.9375,
      "grad_norm": 2.2680346965789795,
      "learning_rate": 4.83125e-05,
      "loss": 1.2702,
      "step": 2428
    },
    {
      "epoch": 37.953125,
      "grad_norm": 2.933195114135742,
      "learning_rate": 4.825e-05,
      "loss": 1.284,
      "step": 2429
    },
    {
      "epoch": 37.96875,
      "grad_norm": 2.582068681716919,
      "learning_rate": 4.81875e-05,
      "loss": 1.1358,
      "step": 2430
    },
    {
      "epoch": 37.984375,
      "grad_norm": 2.4999032020568848,
      "learning_rate": 4.8125000000000004e-05,
      "loss": 1.3198,
      "step": 2431
    },
    {
      "epoch": 38.0,
      "grad_norm": 2.869053602218628,
      "learning_rate": 4.80625e-05,
      "loss": 1.0653,
      "step": 2432
    },
    {
      "epoch": 38.0,
      "eval_loss": 3.08675217628479,
      "eval_runtime": 2.8924,
      "eval_samples_per_second": 177.017,
      "eval_steps_per_second": 44.254,
      "step": 2432
    },
    {
      "epoch": 38.015625,
      "grad_norm": 2.5251576900482178,
      "learning_rate": 4.8e-05,
      "loss": 1.3337,
      "step": 2433
    },
    {
      "epoch": 38.03125,
      "grad_norm": 2.560723304748535,
      "learning_rate": 4.79375e-05,
      "loss": 1.4823,
      "step": 2434
    },
    {
      "epoch": 38.046875,
      "grad_norm": 2.5332131385803223,
      "learning_rate": 4.7875000000000005e-05,
      "loss": 1.2942,
      "step": 2435
    },
    {
      "epoch": 38.0625,
      "grad_norm": 2.2886948585510254,
      "learning_rate": 4.7812500000000003e-05,
      "loss": 1.0115,
      "step": 2436
    },
    {
      "epoch": 38.078125,
      "grad_norm": 2.2560365200042725,
      "learning_rate": 4.775e-05,
      "loss": 1.0504,
      "step": 2437
    },
    {
      "epoch": 38.09375,
      "grad_norm": 2.280565023422241,
      "learning_rate": 4.76875e-05,
      "loss": 1.4916,
      "step": 2438
    },
    {
      "epoch": 38.109375,
      "grad_norm": 2.284820079803467,
      "learning_rate": 4.7625000000000006e-05,
      "loss": 1.0877,
      "step": 2439
    },
    {
      "epoch": 38.125,
      "grad_norm": 2.602785587310791,
      "learning_rate": 4.7562500000000004e-05,
      "loss": 1.1227,
      "step": 2440
    },
    {
      "epoch": 38.140625,
      "grad_norm": 2.6239986419677734,
      "learning_rate": 4.75e-05,
      "loss": 1.3227,
      "step": 2441
    },
    {
      "epoch": 38.15625,
      "grad_norm": 2.4514381885528564,
      "learning_rate": 4.74375e-05,
      "loss": 1.1608,
      "step": 2442
    },
    {
      "epoch": 38.171875,
      "grad_norm": 2.1651532649993896,
      "learning_rate": 4.7375e-05,
      "loss": 1.1597,
      "step": 2443
    },
    {
      "epoch": 38.1875,
      "grad_norm": 2.657510757446289,
      "learning_rate": 4.7312500000000005e-05,
      "loss": 1.2121,
      "step": 2444
    },
    {
      "epoch": 38.203125,
      "grad_norm": 2.1342451572418213,
      "learning_rate": 4.7249999999999997e-05,
      "loss": 1.1735,
      "step": 2445
    },
    {
      "epoch": 38.21875,
      "grad_norm": 2.6772093772888184,
      "learning_rate": 4.71875e-05,
      "loss": 1.0414,
      "step": 2446
    },
    {
      "epoch": 38.234375,
      "grad_norm": 2.8137423992156982,
      "learning_rate": 4.7125e-05,
      "loss": 1.0904,
      "step": 2447
    },
    {
      "epoch": 38.25,
      "grad_norm": 2.6461029052734375,
      "learning_rate": 4.7062500000000006e-05,
      "loss": 1.2638,
      "step": 2448
    },
    {
      "epoch": 38.265625,
      "grad_norm": 2.39631986618042,
      "learning_rate": 4.7e-05,
      "loss": 1.0889,
      "step": 2449
    },
    {
      "epoch": 38.28125,
      "grad_norm": 2.2550249099731445,
      "learning_rate": 4.69375e-05,
      "loss": 0.9784,
      "step": 2450
    },
    {
      "epoch": 38.296875,
      "grad_norm": 2.4786651134490967,
      "learning_rate": 4.6875e-05,
      "loss": 1.0831,
      "step": 2451
    },
    {
      "epoch": 38.3125,
      "grad_norm": 2.631544828414917,
      "learning_rate": 4.6812500000000006e-05,
      "loss": 1.4308,
      "step": 2452
    },
    {
      "epoch": 38.328125,
      "grad_norm": 2.3832592964172363,
      "learning_rate": 4.6750000000000005e-05,
      "loss": 1.3999,
      "step": 2453
    },
    {
      "epoch": 38.34375,
      "grad_norm": 2.150158643722534,
      "learning_rate": 4.66875e-05,
      "loss": 1.2767,
      "step": 2454
    },
    {
      "epoch": 38.359375,
      "grad_norm": 2.0179433822631836,
      "learning_rate": 4.6625e-05,
      "loss": 1.1179,
      "step": 2455
    },
    {
      "epoch": 38.375,
      "grad_norm": 2.6121339797973633,
      "learning_rate": 4.65625e-05,
      "loss": 1.2506,
      "step": 2456
    },
    {
      "epoch": 38.390625,
      "grad_norm": 2.186060667037964,
      "learning_rate": 4.6500000000000005e-05,
      "loss": 1.0902,
      "step": 2457
    },
    {
      "epoch": 38.40625,
      "grad_norm": 2.3070719242095947,
      "learning_rate": 4.64375e-05,
      "loss": 1.3506,
      "step": 2458
    },
    {
      "epoch": 38.421875,
      "grad_norm": 2.358633518218994,
      "learning_rate": 4.6375e-05,
      "loss": 1.1013,
      "step": 2459
    },
    {
      "epoch": 38.4375,
      "grad_norm": 2.893320322036743,
      "learning_rate": 4.63125e-05,
      "loss": 1.1083,
      "step": 2460
    },
    {
      "epoch": 38.453125,
      "grad_norm": 2.158008098602295,
      "learning_rate": 4.6250000000000006e-05,
      "loss": 1.1821,
      "step": 2461
    },
    {
      "epoch": 38.46875,
      "grad_norm": 2.179551124572754,
      "learning_rate": 4.61875e-05,
      "loss": 1.5346,
      "step": 2462
    },
    {
      "epoch": 38.484375,
      "grad_norm": 2.4604692459106445,
      "learning_rate": 4.6125e-05,
      "loss": 1.2693,
      "step": 2463
    },
    {
      "epoch": 38.5,
      "grad_norm": 2.321843385696411,
      "learning_rate": 4.60625e-05,
      "loss": 1.0057,
      "step": 2464
    },
    {
      "epoch": 38.515625,
      "grad_norm": 2.401614189147949,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.2382,
      "step": 2465
    },
    {
      "epoch": 38.53125,
      "grad_norm": 2.185986280441284,
      "learning_rate": 4.59375e-05,
      "loss": 1.0976,
      "step": 2466
    },
    {
      "epoch": 38.546875,
      "grad_norm": 2.5104007720947266,
      "learning_rate": 4.5875000000000004e-05,
      "loss": 1.106,
      "step": 2467
    },
    {
      "epoch": 38.5625,
      "grad_norm": 2.2614262104034424,
      "learning_rate": 4.58125e-05,
      "loss": 1.2667,
      "step": 2468
    },
    {
      "epoch": 38.578125,
      "grad_norm": 2.700568675994873,
      "learning_rate": 4.575e-05,
      "loss": 1.0051,
      "step": 2469
    },
    {
      "epoch": 38.59375,
      "grad_norm": 2.2442820072174072,
      "learning_rate": 4.56875e-05,
      "loss": 1.2488,
      "step": 2470
    },
    {
      "epoch": 38.609375,
      "grad_norm": 2.6740283966064453,
      "learning_rate": 4.5625e-05,
      "loss": 1.1665,
      "step": 2471
    },
    {
      "epoch": 38.625,
      "grad_norm": 2.4515836238861084,
      "learning_rate": 4.55625e-05,
      "loss": 1.2656,
      "step": 2472
    },
    {
      "epoch": 38.640625,
      "grad_norm": 2.194737434387207,
      "learning_rate": 4.55e-05,
      "loss": 1.4442,
      "step": 2473
    },
    {
      "epoch": 38.65625,
      "grad_norm": 2.362487316131592,
      "learning_rate": 4.54375e-05,
      "loss": 1.257,
      "step": 2474
    },
    {
      "epoch": 38.671875,
      "grad_norm": 2.746361255645752,
      "learning_rate": 4.5375e-05,
      "loss": 1.2271,
      "step": 2475
    },
    {
      "epoch": 38.6875,
      "grad_norm": 2.509782075881958,
      "learning_rate": 4.5312500000000004e-05,
      "loss": 1.3377,
      "step": 2476
    },
    {
      "epoch": 38.703125,
      "grad_norm": 2.236074686050415,
      "learning_rate": 4.525e-05,
      "loss": 1.2195,
      "step": 2477
    },
    {
      "epoch": 38.71875,
      "grad_norm": 2.2817845344543457,
      "learning_rate": 4.518750000000001e-05,
      "loss": 1.215,
      "step": 2478
    },
    {
      "epoch": 38.734375,
      "grad_norm": 2.379135847091675,
      "learning_rate": 4.5125e-05,
      "loss": 1.226,
      "step": 2479
    },
    {
      "epoch": 38.75,
      "grad_norm": 2.6877970695495605,
      "learning_rate": 4.5062500000000004e-05,
      "loss": 1.1454,
      "step": 2480
    },
    {
      "epoch": 38.765625,
      "grad_norm": 2.6626904010772705,
      "learning_rate": 4.5e-05,
      "loss": 1.4248,
      "step": 2481
    },
    {
      "epoch": 38.78125,
      "grad_norm": 2.5345704555511475,
      "learning_rate": 4.49375e-05,
      "loss": 1.3095,
      "step": 2482
    },
    {
      "epoch": 38.796875,
      "grad_norm": 2.502391815185547,
      "learning_rate": 4.4875e-05,
      "loss": 0.9548,
      "step": 2483
    },
    {
      "epoch": 38.8125,
      "grad_norm": 2.455371379852295,
      "learning_rate": 4.4812500000000005e-05,
      "loss": 1.1388,
      "step": 2484
    },
    {
      "epoch": 38.828125,
      "grad_norm": 2.7876713275909424,
      "learning_rate": 4.4750000000000004e-05,
      "loss": 1.0737,
      "step": 2485
    },
    {
      "epoch": 38.84375,
      "grad_norm": 2.3308091163635254,
      "learning_rate": 4.46875e-05,
      "loss": 1.153,
      "step": 2486
    },
    {
      "epoch": 38.859375,
      "grad_norm": 2.4333012104034424,
      "learning_rate": 4.4625e-05,
      "loss": 1.2391,
      "step": 2487
    },
    {
      "epoch": 38.875,
      "grad_norm": 2.3935344219207764,
      "learning_rate": 4.45625e-05,
      "loss": 1.1272,
      "step": 2488
    },
    {
      "epoch": 38.890625,
      "grad_norm": 2.805319309234619,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 1.1916,
      "step": 2489
    },
    {
      "epoch": 38.90625,
      "grad_norm": 2.3204562664031982,
      "learning_rate": 4.44375e-05,
      "loss": 1.4765,
      "step": 2490
    },
    {
      "epoch": 38.921875,
      "grad_norm": 2.6873884201049805,
      "learning_rate": 4.4375e-05,
      "loss": 1.2537,
      "step": 2491
    },
    {
      "epoch": 38.9375,
      "grad_norm": 2.7756845951080322,
      "learning_rate": 4.43125e-05,
      "loss": 1.335,
      "step": 2492
    },
    {
      "epoch": 38.953125,
      "grad_norm": 2.6339614391326904,
      "learning_rate": 4.4250000000000005e-05,
      "loss": 1.2138,
      "step": 2493
    },
    {
      "epoch": 38.96875,
      "grad_norm": 2.3255813121795654,
      "learning_rate": 4.4187500000000003e-05,
      "loss": 1.205,
      "step": 2494
    },
    {
      "epoch": 38.984375,
      "grad_norm": 2.4474573135375977,
      "learning_rate": 4.4125e-05,
      "loss": 1.3445,
      "step": 2495
    },
    {
      "epoch": 39.0,
      "grad_norm": 2.7683255672454834,
      "learning_rate": 4.40625e-05,
      "loss": 1.3312,
      "step": 2496
    },
    {
      "epoch": 39.0,
      "eval_loss": 3.0933423042297363,
      "eval_runtime": 2.9777,
      "eval_samples_per_second": 171.944,
      "eval_steps_per_second": 42.986,
      "step": 2496
    },
    {
      "epoch": 39.015625,
      "grad_norm": 2.3934571743011475,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.1994,
      "step": 2497
    },
    {
      "epoch": 39.03125,
      "grad_norm": 2.709887981414795,
      "learning_rate": 4.3937500000000004e-05,
      "loss": 1.0983,
      "step": 2498
    },
    {
      "epoch": 39.046875,
      "grad_norm": 2.615825653076172,
      "learning_rate": 4.3875e-05,
      "loss": 1.2834,
      "step": 2499
    },
    {
      "epoch": 39.0625,
      "grad_norm": 2.5091798305511475,
      "learning_rate": 4.38125e-05,
      "loss": 1.3722,
      "step": 2500
    },
    {
      "epoch": 39.078125,
      "grad_norm": 2.310389518737793,
      "learning_rate": 4.375e-05,
      "loss": 1.1456,
      "step": 2501
    },
    {
      "epoch": 39.09375,
      "grad_norm": 2.370894432067871,
      "learning_rate": 4.3687500000000005e-05,
      "loss": 1.1114,
      "step": 2502
    },
    {
      "epoch": 39.109375,
      "grad_norm": 2.3780529499053955,
      "learning_rate": 4.3625e-05,
      "loss": 1.2145,
      "step": 2503
    },
    {
      "epoch": 39.125,
      "grad_norm": 2.623994827270508,
      "learning_rate": 4.35625e-05,
      "loss": 1.072,
      "step": 2504
    },
    {
      "epoch": 39.140625,
      "grad_norm": 2.6409108638763428,
      "learning_rate": 4.35e-05,
      "loss": 1.2946,
      "step": 2505
    },
    {
      "epoch": 39.15625,
      "grad_norm": 2.4003381729125977,
      "learning_rate": 4.3437500000000006e-05,
      "loss": 1.3057,
      "step": 2506
    },
    {
      "epoch": 39.171875,
      "grad_norm": 2.4956820011138916,
      "learning_rate": 4.3375000000000004e-05,
      "loss": 1.2401,
      "step": 2507
    },
    {
      "epoch": 39.1875,
      "grad_norm": 2.387091636657715,
      "learning_rate": 4.33125e-05,
      "loss": 1.2246,
      "step": 2508
    },
    {
      "epoch": 39.203125,
      "grad_norm": 2.476961374282837,
      "learning_rate": 4.325e-05,
      "loss": 1.3421,
      "step": 2509
    },
    {
      "epoch": 39.21875,
      "grad_norm": 2.3415684700012207,
      "learning_rate": 4.3187500000000006e-05,
      "loss": 1.3499,
      "step": 2510
    },
    {
      "epoch": 39.234375,
      "grad_norm": 3.6879637241363525,
      "learning_rate": 4.3125000000000005e-05,
      "loss": 1.3221,
      "step": 2511
    },
    {
      "epoch": 39.25,
      "grad_norm": 2.569089651107788,
      "learning_rate": 4.30625e-05,
      "loss": 1.2061,
      "step": 2512
    },
    {
      "epoch": 39.265625,
      "grad_norm": 2.435896635055542,
      "learning_rate": 4.3e-05,
      "loss": 1.2729,
      "step": 2513
    },
    {
      "epoch": 39.28125,
      "grad_norm": 2.2961244583129883,
      "learning_rate": 4.29375e-05,
      "loss": 0.9892,
      "step": 2514
    },
    {
      "epoch": 39.296875,
      "grad_norm": 2.3020169734954834,
      "learning_rate": 4.2875000000000005e-05,
      "loss": 1.1962,
      "step": 2515
    },
    {
      "epoch": 39.3125,
      "grad_norm": 2.5682077407836914,
      "learning_rate": 4.28125e-05,
      "loss": 0.7496,
      "step": 2516
    },
    {
      "epoch": 39.328125,
      "grad_norm": 2.467649221420288,
      "learning_rate": 4.275e-05,
      "loss": 1.3008,
      "step": 2517
    },
    {
      "epoch": 39.34375,
      "grad_norm": 2.132655620574951,
      "learning_rate": 4.26875e-05,
      "loss": 1.2253,
      "step": 2518
    },
    {
      "epoch": 39.359375,
      "grad_norm": 2.089503526687622,
      "learning_rate": 4.2625000000000006e-05,
      "loss": 1.3226,
      "step": 2519
    },
    {
      "epoch": 39.375,
      "grad_norm": 2.580470085144043,
      "learning_rate": 4.25625e-05,
      "loss": 1.308,
      "step": 2520
    },
    {
      "epoch": 39.390625,
      "grad_norm": 2.499681234359741,
      "learning_rate": 4.25e-05,
      "loss": 1.0988,
      "step": 2521
    },
    {
      "epoch": 39.40625,
      "grad_norm": 2.4389889240264893,
      "learning_rate": 4.24375e-05,
      "loss": 1.1179,
      "step": 2522
    },
    {
      "epoch": 39.421875,
      "grad_norm": 2.480201244354248,
      "learning_rate": 4.237500000000001e-05,
      "loss": 1.2817,
      "step": 2523
    },
    {
      "epoch": 39.4375,
      "grad_norm": 2.296304225921631,
      "learning_rate": 4.23125e-05,
      "loss": 1.2519,
      "step": 2524
    },
    {
      "epoch": 39.453125,
      "grad_norm": 2.5176446437835693,
      "learning_rate": 4.2250000000000004e-05,
      "loss": 1.4377,
      "step": 2525
    },
    {
      "epoch": 39.46875,
      "grad_norm": 2.581881284713745,
      "learning_rate": 4.21875e-05,
      "loss": 1.1789,
      "step": 2526
    },
    {
      "epoch": 39.484375,
      "grad_norm": 2.253641366958618,
      "learning_rate": 4.2125e-05,
      "loss": 1.3011,
      "step": 2527
    },
    {
      "epoch": 39.5,
      "grad_norm": 2.634950876235962,
      "learning_rate": 4.2062500000000006e-05,
      "loss": 1.2381,
      "step": 2528
    },
    {
      "epoch": 39.515625,
      "grad_norm": 2.6963913440704346,
      "learning_rate": 4.2e-05,
      "loss": 1.0872,
      "step": 2529
    },
    {
      "epoch": 39.53125,
      "grad_norm": 2.761931896209717,
      "learning_rate": 4.19375e-05,
      "loss": 1.11,
      "step": 2530
    },
    {
      "epoch": 39.546875,
      "grad_norm": 2.6362757682800293,
      "learning_rate": 4.1875e-05,
      "loss": 1.1393,
      "step": 2531
    },
    {
      "epoch": 39.5625,
      "grad_norm": 2.4076762199401855,
      "learning_rate": 4.181250000000001e-05,
      "loss": 1.2356,
      "step": 2532
    },
    {
      "epoch": 39.578125,
      "grad_norm": 2.443892002105713,
      "learning_rate": 4.175e-05,
      "loss": 1.106,
      "step": 2533
    },
    {
      "epoch": 39.59375,
      "grad_norm": 2.627319812774658,
      "learning_rate": 4.1687500000000004e-05,
      "loss": 1.1277,
      "step": 2534
    },
    {
      "epoch": 39.609375,
      "grad_norm": 2.545684576034546,
      "learning_rate": 4.1625e-05,
      "loss": 1.24,
      "step": 2535
    },
    {
      "epoch": 39.625,
      "grad_norm": 2.2597289085388184,
      "learning_rate": 4.156250000000001e-05,
      "loss": 1.5364,
      "step": 2536
    },
    {
      "epoch": 39.640625,
      "grad_norm": 2.4790360927581787,
      "learning_rate": 4.15e-05,
      "loss": 1.2406,
      "step": 2537
    },
    {
      "epoch": 39.65625,
      "grad_norm": 2.220416307449341,
      "learning_rate": 4.1437500000000004e-05,
      "loss": 1.2441,
      "step": 2538
    },
    {
      "epoch": 39.671875,
      "grad_norm": 2.504185676574707,
      "learning_rate": 4.1375e-05,
      "loss": 1.1483,
      "step": 2539
    },
    {
      "epoch": 39.6875,
      "grad_norm": 2.256648540496826,
      "learning_rate": 4.13125e-05,
      "loss": 1.2188,
      "step": 2540
    },
    {
      "epoch": 39.703125,
      "grad_norm": 2.44254994392395,
      "learning_rate": 4.125e-05,
      "loss": 1.3035,
      "step": 2541
    },
    {
      "epoch": 39.71875,
      "grad_norm": 2.4062118530273438,
      "learning_rate": 4.11875e-05,
      "loss": 1.1524,
      "step": 2542
    },
    {
      "epoch": 39.734375,
      "grad_norm": 2.4959356784820557,
      "learning_rate": 4.1125000000000004e-05,
      "loss": 1.2611,
      "step": 2543
    },
    {
      "epoch": 39.75,
      "grad_norm": 2.886883497238159,
      "learning_rate": 4.10625e-05,
      "loss": 1.1997,
      "step": 2544
    },
    {
      "epoch": 39.765625,
      "grad_norm": 2.33504319190979,
      "learning_rate": 4.1e-05,
      "loss": 1.2949,
      "step": 2545
    },
    {
      "epoch": 39.78125,
      "grad_norm": 2.1298582553863525,
      "learning_rate": 4.09375e-05,
      "loss": 1.2892,
      "step": 2546
    },
    {
      "epoch": 39.796875,
      "grad_norm": 2.2756643295288086,
      "learning_rate": 4.0875000000000004e-05,
      "loss": 1.2956,
      "step": 2547
    },
    {
      "epoch": 39.8125,
      "grad_norm": 2.104339122772217,
      "learning_rate": 4.08125e-05,
      "loss": 1.2054,
      "step": 2548
    },
    {
      "epoch": 39.828125,
      "grad_norm": 2.307018995285034,
      "learning_rate": 4.075e-05,
      "loss": 1.2934,
      "step": 2549
    },
    {
      "epoch": 39.84375,
      "grad_norm": 2.813544988632202,
      "learning_rate": 4.06875e-05,
      "loss": 0.8628,
      "step": 2550
    },
    {
      "epoch": 39.859375,
      "grad_norm": 2.994710683822632,
      "learning_rate": 4.0625000000000005e-05,
      "loss": 1.0295,
      "step": 2551
    },
    {
      "epoch": 39.875,
      "grad_norm": 2.6770529747009277,
      "learning_rate": 4.0562500000000003e-05,
      "loss": 1.1938,
      "step": 2552
    },
    {
      "epoch": 39.890625,
      "grad_norm": 2.26878023147583,
      "learning_rate": 4.05e-05,
      "loss": 0.9725,
      "step": 2553
    },
    {
      "epoch": 39.90625,
      "grad_norm": 2.6111834049224854,
      "learning_rate": 4.04375e-05,
      "loss": 1.3972,
      "step": 2554
    },
    {
      "epoch": 39.921875,
      "grad_norm": 2.6579477787017822,
      "learning_rate": 4.0375e-05,
      "loss": 1.3476,
      "step": 2555
    },
    {
      "epoch": 39.9375,
      "grad_norm": 2.3528270721435547,
      "learning_rate": 4.0312500000000004e-05,
      "loss": 1.3901,
      "step": 2556
    },
    {
      "epoch": 39.953125,
      "grad_norm": 2.473478078842163,
      "learning_rate": 4.025e-05,
      "loss": 1.2437,
      "step": 2557
    },
    {
      "epoch": 39.96875,
      "grad_norm": 2.4995930194854736,
      "learning_rate": 4.01875e-05,
      "loss": 1.2816,
      "step": 2558
    },
    {
      "epoch": 39.984375,
      "grad_norm": 2.5089526176452637,
      "learning_rate": 4.0125e-05,
      "loss": 1.3444,
      "step": 2559
    },
    {
      "epoch": 40.0,
      "grad_norm": 3.820305585861206,
      "learning_rate": 4.0062500000000005e-05,
      "loss": 1.2567,
      "step": 2560
    },
    {
      "epoch": 40.0,
      "eval_loss": 3.094651699066162,
      "eval_runtime": 3.0269,
      "eval_samples_per_second": 169.148,
      "eval_steps_per_second": 42.287,
      "step": 2560
    },
    {
      "epoch": 40.015625,
      "grad_norm": 2.4297778606414795,
      "learning_rate": 4e-05,
      "loss": 1.0963,
      "step": 2561
    },
    {
      "epoch": 40.03125,
      "grad_norm": 2.7408950328826904,
      "learning_rate": 3.99375e-05,
      "loss": 1.2495,
      "step": 2562
    },
    {
      "epoch": 40.046875,
      "grad_norm": 2.540273904800415,
      "learning_rate": 3.9875e-05,
      "loss": 1.1754,
      "step": 2563
    },
    {
      "epoch": 40.0625,
      "grad_norm": 2.478773593902588,
      "learning_rate": 3.9812500000000005e-05,
      "loss": 1.2085,
      "step": 2564
    },
    {
      "epoch": 40.078125,
      "grad_norm": 2.46528697013855,
      "learning_rate": 3.9750000000000004e-05,
      "loss": 1.1757,
      "step": 2565
    },
    {
      "epoch": 40.09375,
      "grad_norm": 2.5190987586975098,
      "learning_rate": 3.96875e-05,
      "loss": 1.1568,
      "step": 2566
    },
    {
      "epoch": 40.109375,
      "grad_norm": 2.610447883605957,
      "learning_rate": 3.9625e-05,
      "loss": 1.1368,
      "step": 2567
    },
    {
      "epoch": 40.125,
      "grad_norm": 2.288994312286377,
      "learning_rate": 3.95625e-05,
      "loss": 1.3182,
      "step": 2568
    },
    {
      "epoch": 40.140625,
      "grad_norm": 2.5079238414764404,
      "learning_rate": 3.9500000000000005e-05,
      "loss": 1.2902,
      "step": 2569
    },
    {
      "epoch": 40.15625,
      "grad_norm": 2.458632230758667,
      "learning_rate": 3.9437499999999996e-05,
      "loss": 1.2218,
      "step": 2570
    },
    {
      "epoch": 40.171875,
      "grad_norm": 2.5671756267547607,
      "learning_rate": 3.9375e-05,
      "loss": 1.3565,
      "step": 2571
    },
    {
      "epoch": 40.1875,
      "grad_norm": 2.3744001388549805,
      "learning_rate": 3.93125e-05,
      "loss": 1.1899,
      "step": 2572
    },
    {
      "epoch": 40.203125,
      "grad_norm": 2.171532392501831,
      "learning_rate": 3.9250000000000005e-05,
      "loss": 1.1318,
      "step": 2573
    },
    {
      "epoch": 40.21875,
      "grad_norm": 2.136967658996582,
      "learning_rate": 3.91875e-05,
      "loss": 1.1334,
      "step": 2574
    },
    {
      "epoch": 40.234375,
      "grad_norm": 2.381612539291382,
      "learning_rate": 3.9125e-05,
      "loss": 1.2743,
      "step": 2575
    },
    {
      "epoch": 40.25,
      "grad_norm": 2.4735825061798096,
      "learning_rate": 3.90625e-05,
      "loss": 1.2928,
      "step": 2576
    },
    {
      "epoch": 40.265625,
      "grad_norm": 2.536140203475952,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.7749,
      "step": 2577
    },
    {
      "epoch": 40.28125,
      "grad_norm": 2.3858325481414795,
      "learning_rate": 3.8937500000000005e-05,
      "loss": 1.1087,
      "step": 2578
    },
    {
      "epoch": 40.296875,
      "grad_norm": 2.758117198944092,
      "learning_rate": 3.8875e-05,
      "loss": 1.0997,
      "step": 2579
    },
    {
      "epoch": 40.3125,
      "grad_norm": 2.033874273300171,
      "learning_rate": 3.88125e-05,
      "loss": 0.9998,
      "step": 2580
    },
    {
      "epoch": 40.328125,
      "grad_norm": 2.514662265777588,
      "learning_rate": 3.875e-05,
      "loss": 1.2481,
      "step": 2581
    },
    {
      "epoch": 40.34375,
      "grad_norm": 2.5035266876220703,
      "learning_rate": 3.8687500000000005e-05,
      "loss": 1.258,
      "step": 2582
    },
    {
      "epoch": 40.359375,
      "grad_norm": 2.3464548587799072,
      "learning_rate": 3.8625e-05,
      "loss": 1.1329,
      "step": 2583
    },
    {
      "epoch": 40.375,
      "grad_norm": 2.323132038116455,
      "learning_rate": 3.85625e-05,
      "loss": 1.0125,
      "step": 2584
    },
    {
      "epoch": 40.390625,
      "grad_norm": 2.4581069946289062,
      "learning_rate": 3.85e-05,
      "loss": 1.2771,
      "step": 2585
    },
    {
      "epoch": 40.40625,
      "grad_norm": 2.700352191925049,
      "learning_rate": 3.8437500000000006e-05,
      "loss": 1.2115,
      "step": 2586
    },
    {
      "epoch": 40.421875,
      "grad_norm": 2.3771145343780518,
      "learning_rate": 3.8375e-05,
      "loss": 1.1742,
      "step": 2587
    },
    {
      "epoch": 40.4375,
      "grad_norm": 2.4798989295959473,
      "learning_rate": 3.83125e-05,
      "loss": 1.0807,
      "step": 2588
    },
    {
      "epoch": 40.453125,
      "grad_norm": 2.5848612785339355,
      "learning_rate": 3.825e-05,
      "loss": 1.3748,
      "step": 2589
    },
    {
      "epoch": 40.46875,
      "grad_norm": 2.338510274887085,
      "learning_rate": 3.818750000000001e-05,
      "loss": 0.8995,
      "step": 2590
    },
    {
      "epoch": 40.484375,
      "grad_norm": 2.6487395763397217,
      "learning_rate": 3.8125e-05,
      "loss": 1.278,
      "step": 2591
    },
    {
      "epoch": 40.5,
      "grad_norm": 2.9919023513793945,
      "learning_rate": 3.8062500000000004e-05,
      "loss": 1.1363,
      "step": 2592
    },
    {
      "epoch": 40.515625,
      "grad_norm": 2.4636025428771973,
      "learning_rate": 3.8e-05,
      "loss": 1.1376,
      "step": 2593
    },
    {
      "epoch": 40.53125,
      "grad_norm": 2.214099645614624,
      "learning_rate": 3.79375e-05,
      "loss": 1.199,
      "step": 2594
    },
    {
      "epoch": 40.546875,
      "grad_norm": 2.2829902172088623,
      "learning_rate": 3.7875e-05,
      "loss": 1.2441,
      "step": 2595
    },
    {
      "epoch": 40.5625,
      "grad_norm": 2.3876233100891113,
      "learning_rate": 3.78125e-05,
      "loss": 1.2645,
      "step": 2596
    },
    {
      "epoch": 40.578125,
      "grad_norm": 2.3961713314056396,
      "learning_rate": 3.775e-05,
      "loss": 1.1659,
      "step": 2597
    },
    {
      "epoch": 40.59375,
      "grad_norm": 2.3235368728637695,
      "learning_rate": 3.76875e-05,
      "loss": 1.2019,
      "step": 2598
    },
    {
      "epoch": 40.609375,
      "grad_norm": 2.5562808513641357,
      "learning_rate": 3.7625e-05,
      "loss": 1.1754,
      "step": 2599
    },
    {
      "epoch": 40.625,
      "grad_norm": 2.494673252105713,
      "learning_rate": 3.75625e-05,
      "loss": 1.2969,
      "step": 2600
    },
    {
      "epoch": 40.640625,
      "grad_norm": 2.6250481605529785,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 1.1637,
      "step": 2601
    },
    {
      "epoch": 40.65625,
      "grad_norm": 2.41280460357666,
      "learning_rate": 3.74375e-05,
      "loss": 1.3047,
      "step": 2602
    },
    {
      "epoch": 40.671875,
      "grad_norm": 2.3434817790985107,
      "learning_rate": 3.737500000000001e-05,
      "loss": 1.1087,
      "step": 2603
    },
    {
      "epoch": 40.6875,
      "grad_norm": 2.4926364421844482,
      "learning_rate": 3.73125e-05,
      "loss": 1.1279,
      "step": 2604
    },
    {
      "epoch": 40.703125,
      "grad_norm": 2.808342695236206,
      "learning_rate": 3.7250000000000004e-05,
      "loss": 1.2451,
      "step": 2605
    },
    {
      "epoch": 40.71875,
      "grad_norm": 2.411726713180542,
      "learning_rate": 3.71875e-05,
      "loss": 1.3974,
      "step": 2606
    },
    {
      "epoch": 40.734375,
      "grad_norm": 2.301586389541626,
      "learning_rate": 3.7125e-05,
      "loss": 1.2482,
      "step": 2607
    },
    {
      "epoch": 40.75,
      "grad_norm": 2.3796679973602295,
      "learning_rate": 3.70625e-05,
      "loss": 1.4419,
      "step": 2608
    },
    {
      "epoch": 40.765625,
      "grad_norm": 2.2763671875,
      "learning_rate": 3.7e-05,
      "loss": 1.104,
      "step": 2609
    },
    {
      "epoch": 40.78125,
      "grad_norm": 2.672067403793335,
      "learning_rate": 3.69375e-05,
      "loss": 1.2783,
      "step": 2610
    },
    {
      "epoch": 40.796875,
      "grad_norm": 2.401934862136841,
      "learning_rate": 3.6875e-05,
      "loss": 1.3675,
      "step": 2611
    },
    {
      "epoch": 40.8125,
      "grad_norm": 2.4273734092712402,
      "learning_rate": 3.68125e-05,
      "loss": 1.1584,
      "step": 2612
    },
    {
      "epoch": 40.828125,
      "grad_norm": 2.5027246475219727,
      "learning_rate": 3.675e-05,
      "loss": 1.3472,
      "step": 2613
    },
    {
      "epoch": 40.84375,
      "grad_norm": 2.7212746143341064,
      "learning_rate": 3.6687500000000004e-05,
      "loss": 1.0443,
      "step": 2614
    },
    {
      "epoch": 40.859375,
      "grad_norm": 2.6267967224121094,
      "learning_rate": 3.6625e-05,
      "loss": 1.1726,
      "step": 2615
    },
    {
      "epoch": 40.875,
      "grad_norm": 2.3736870288848877,
      "learning_rate": 3.65625e-05,
      "loss": 1.2996,
      "step": 2616
    },
    {
      "epoch": 40.890625,
      "grad_norm": 2.3467698097229004,
      "learning_rate": 3.65e-05,
      "loss": 1.4511,
      "step": 2617
    },
    {
      "epoch": 40.90625,
      "grad_norm": 2.265599250793457,
      "learning_rate": 3.6437500000000005e-05,
      "loss": 1.0561,
      "step": 2618
    },
    {
      "epoch": 40.921875,
      "grad_norm": 2.50246262550354,
      "learning_rate": 3.6375e-05,
      "loss": 1.267,
      "step": 2619
    },
    {
      "epoch": 40.9375,
      "grad_norm": 2.4827325344085693,
      "learning_rate": 3.63125e-05,
      "loss": 1.1472,
      "step": 2620
    },
    {
      "epoch": 40.953125,
      "grad_norm": 2.1171791553497314,
      "learning_rate": 3.625e-05,
      "loss": 1.1816,
      "step": 2621
    },
    {
      "epoch": 40.96875,
      "grad_norm": 2.541794776916504,
      "learning_rate": 3.61875e-05,
      "loss": 1.3172,
      "step": 2622
    },
    {
      "epoch": 40.984375,
      "grad_norm": 2.7482526302337646,
      "learning_rate": 3.6125000000000004e-05,
      "loss": 1.5795,
      "step": 2623
    },
    {
      "epoch": 41.0,
      "grad_norm": 3.858243227005005,
      "learning_rate": 3.60625e-05,
      "loss": 1.172,
      "step": 2624
    },
    {
      "epoch": 41.0,
      "eval_loss": 3.095574378967285,
      "eval_runtime": 2.9293,
      "eval_samples_per_second": 174.783,
      "eval_steps_per_second": 43.696,
      "step": 2624
    },
    {
      "epoch": 41.015625,
      "grad_norm": 2.471578598022461,
      "learning_rate": 3.6e-05,
      "loss": 1.3065,
      "step": 2625
    },
    {
      "epoch": 41.03125,
      "grad_norm": 2.534151077270508,
      "learning_rate": 3.59375e-05,
      "loss": 1.3825,
      "step": 2626
    },
    {
      "epoch": 41.046875,
      "grad_norm": 2.545274257659912,
      "learning_rate": 3.5875000000000005e-05,
      "loss": 1.3407,
      "step": 2627
    },
    {
      "epoch": 41.0625,
      "grad_norm": 2.2009711265563965,
      "learning_rate": 3.58125e-05,
      "loss": 1.3739,
      "step": 2628
    },
    {
      "epoch": 41.078125,
      "grad_norm": 2.5715298652648926,
      "learning_rate": 3.575e-05,
      "loss": 1.0759,
      "step": 2629
    },
    {
      "epoch": 41.09375,
      "grad_norm": 2.8132431507110596,
      "learning_rate": 3.56875e-05,
      "loss": 1.1911,
      "step": 2630
    },
    {
      "epoch": 41.109375,
      "grad_norm": 2.460184335708618,
      "learning_rate": 3.5625000000000005e-05,
      "loss": 1.0924,
      "step": 2631
    },
    {
      "epoch": 41.125,
      "grad_norm": 2.4974751472473145,
      "learning_rate": 3.5562500000000004e-05,
      "loss": 1.2557,
      "step": 2632
    },
    {
      "epoch": 41.140625,
      "grad_norm": 2.707573652267456,
      "learning_rate": 3.55e-05,
      "loss": 1.277,
      "step": 2633
    },
    {
      "epoch": 41.15625,
      "grad_norm": 2.5246548652648926,
      "learning_rate": 3.54375e-05,
      "loss": 1.0254,
      "step": 2634
    },
    {
      "epoch": 41.171875,
      "grad_norm": 2.4480972290039062,
      "learning_rate": 3.5375e-05,
      "loss": 1.3319,
      "step": 2635
    },
    {
      "epoch": 41.1875,
      "grad_norm": 2.2227513790130615,
      "learning_rate": 3.5312500000000005e-05,
      "loss": 1.4026,
      "step": 2636
    },
    {
      "epoch": 41.203125,
      "grad_norm": 2.6972477436065674,
      "learning_rate": 3.525e-05,
      "loss": 1.2778,
      "step": 2637
    },
    {
      "epoch": 41.21875,
      "grad_norm": 2.46268892288208,
      "learning_rate": 3.51875e-05,
      "loss": 1.3685,
      "step": 2638
    },
    {
      "epoch": 41.234375,
      "grad_norm": 2.4477291107177734,
      "learning_rate": 3.5125e-05,
      "loss": 1.0959,
      "step": 2639
    },
    {
      "epoch": 41.25,
      "grad_norm": 2.260582447052002,
      "learning_rate": 3.5062500000000005e-05,
      "loss": 1.2718,
      "step": 2640
    },
    {
      "epoch": 41.265625,
      "grad_norm": 2.372626781463623,
      "learning_rate": 3.5e-05,
      "loss": 1.3364,
      "step": 2641
    },
    {
      "epoch": 41.28125,
      "grad_norm": 2.593846082687378,
      "learning_rate": 3.49375e-05,
      "loss": 1.1319,
      "step": 2642
    },
    {
      "epoch": 41.296875,
      "grad_norm": 2.355816602706909,
      "learning_rate": 3.4875e-05,
      "loss": 1.4354,
      "step": 2643
    },
    {
      "epoch": 41.3125,
      "grad_norm": 2.2676827907562256,
      "learning_rate": 3.4812500000000006e-05,
      "loss": 0.8866,
      "step": 2644
    },
    {
      "epoch": 41.328125,
      "grad_norm": 2.151750326156616,
      "learning_rate": 3.475e-05,
      "loss": 1.0898,
      "step": 2645
    },
    {
      "epoch": 41.34375,
      "grad_norm": 2.470905065536499,
      "learning_rate": 3.46875e-05,
      "loss": 1.0824,
      "step": 2646
    },
    {
      "epoch": 41.359375,
      "grad_norm": 2.5446934700012207,
      "learning_rate": 3.4625e-05,
      "loss": 1.1777,
      "step": 2647
    },
    {
      "epoch": 41.375,
      "grad_norm": 2.2487545013427734,
      "learning_rate": 3.45625e-05,
      "loss": 1.2948,
      "step": 2648
    },
    {
      "epoch": 41.390625,
      "grad_norm": 2.413461923599243,
      "learning_rate": 3.45e-05,
      "loss": 1.1779,
      "step": 2649
    },
    {
      "epoch": 41.40625,
      "grad_norm": 2.5567097663879395,
      "learning_rate": 3.4437500000000004e-05,
      "loss": 0.9199,
      "step": 2650
    },
    {
      "epoch": 41.421875,
      "grad_norm": 2.3439674377441406,
      "learning_rate": 3.4375e-05,
      "loss": 1.1846,
      "step": 2651
    },
    {
      "epoch": 41.4375,
      "grad_norm": 2.342423439025879,
      "learning_rate": 3.43125e-05,
      "loss": 1.0997,
      "step": 2652
    },
    {
      "epoch": 41.453125,
      "grad_norm": 2.272202730178833,
      "learning_rate": 3.4250000000000006e-05,
      "loss": 1.1889,
      "step": 2653
    },
    {
      "epoch": 41.46875,
      "grad_norm": 2.278956651687622,
      "learning_rate": 3.41875e-05,
      "loss": 1.0282,
      "step": 2654
    },
    {
      "epoch": 41.484375,
      "grad_norm": 2.347881555557251,
      "learning_rate": 3.4125e-05,
      "loss": 1.1823,
      "step": 2655
    },
    {
      "epoch": 41.5,
      "grad_norm": 2.428847312927246,
      "learning_rate": 3.40625e-05,
      "loss": 1.2431,
      "step": 2656
    },
    {
      "epoch": 41.515625,
      "grad_norm": 2.2257328033447266,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.3405,
      "step": 2657
    },
    {
      "epoch": 41.53125,
      "grad_norm": 2.4844300746917725,
      "learning_rate": 3.39375e-05,
      "loss": 1.1387,
      "step": 2658
    },
    {
      "epoch": 41.546875,
      "grad_norm": 2.3044657707214355,
      "learning_rate": 3.3875000000000003e-05,
      "loss": 1.0253,
      "step": 2659
    },
    {
      "epoch": 41.5625,
      "grad_norm": 2.6096858978271484,
      "learning_rate": 3.38125e-05,
      "loss": 1.0252,
      "step": 2660
    },
    {
      "epoch": 41.578125,
      "grad_norm": 2.2948241233825684,
      "learning_rate": 3.375000000000001e-05,
      "loss": 1.2572,
      "step": 2661
    },
    {
      "epoch": 41.59375,
      "grad_norm": 2.1540679931640625,
      "learning_rate": 3.36875e-05,
      "loss": 1.18,
      "step": 2662
    },
    {
      "epoch": 41.609375,
      "grad_norm": 2.5778322219848633,
      "learning_rate": 3.3625000000000004e-05,
      "loss": 1.113,
      "step": 2663
    },
    {
      "epoch": 41.625,
      "grad_norm": 2.4486377239227295,
      "learning_rate": 3.35625e-05,
      "loss": 1.2698,
      "step": 2664
    },
    {
      "epoch": 41.640625,
      "grad_norm": 2.444331407546997,
      "learning_rate": 3.35e-05,
      "loss": 1.2399,
      "step": 2665
    },
    {
      "epoch": 41.65625,
      "grad_norm": 2.80783748626709,
      "learning_rate": 3.34375e-05,
      "loss": 1.1954,
      "step": 2666
    },
    {
      "epoch": 41.671875,
      "grad_norm": 2.19472599029541,
      "learning_rate": 3.3375e-05,
      "loss": 1.3285,
      "step": 2667
    },
    {
      "epoch": 41.6875,
      "grad_norm": 2.393714189529419,
      "learning_rate": 3.33125e-05,
      "loss": 0.9927,
      "step": 2668
    },
    {
      "epoch": 41.703125,
      "grad_norm": 2.341789960861206,
      "learning_rate": 3.325e-05,
      "loss": 1.3435,
      "step": 2669
    },
    {
      "epoch": 41.71875,
      "grad_norm": 2.6332736015319824,
      "learning_rate": 3.31875e-05,
      "loss": 1.0147,
      "step": 2670
    },
    {
      "epoch": 41.734375,
      "grad_norm": 2.5298094749450684,
      "learning_rate": 3.3125e-05,
      "loss": 1.2422,
      "step": 2671
    },
    {
      "epoch": 41.75,
      "grad_norm": 2.316898822784424,
      "learning_rate": 3.3062500000000004e-05,
      "loss": 1.0439,
      "step": 2672
    },
    {
      "epoch": 41.765625,
      "grad_norm": 2.667917490005493,
      "learning_rate": 3.3e-05,
      "loss": 1.1372,
      "step": 2673
    },
    {
      "epoch": 41.78125,
      "grad_norm": 2.7123608589172363,
      "learning_rate": 3.29375e-05,
      "loss": 1.1554,
      "step": 2674
    },
    {
      "epoch": 41.796875,
      "grad_norm": 2.2717010974884033,
      "learning_rate": 3.2875e-05,
      "loss": 1.0745,
      "step": 2675
    },
    {
      "epoch": 41.8125,
      "grad_norm": 2.7065956592559814,
      "learning_rate": 3.2812500000000005e-05,
      "loss": 1.0241,
      "step": 2676
    },
    {
      "epoch": 41.828125,
      "grad_norm": 2.37487530708313,
      "learning_rate": 3.275e-05,
      "loss": 1.2881,
      "step": 2677
    },
    {
      "epoch": 41.84375,
      "grad_norm": 2.6967384815216064,
      "learning_rate": 3.26875e-05,
      "loss": 1.1303,
      "step": 2678
    },
    {
      "epoch": 41.859375,
      "grad_norm": 2.274902105331421,
      "learning_rate": 3.2625e-05,
      "loss": 1.1684,
      "step": 2679
    },
    {
      "epoch": 41.875,
      "grad_norm": 2.3104958534240723,
      "learning_rate": 3.25625e-05,
      "loss": 1.2304,
      "step": 2680
    },
    {
      "epoch": 41.890625,
      "grad_norm": 2.36221981048584,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 1.0286,
      "step": 2681
    },
    {
      "epoch": 41.90625,
      "grad_norm": 2.5237698554992676,
      "learning_rate": 3.24375e-05,
      "loss": 1.1659,
      "step": 2682
    },
    {
      "epoch": 41.921875,
      "grad_norm": 2.428013324737549,
      "learning_rate": 3.2375e-05,
      "loss": 1.1905,
      "step": 2683
    },
    {
      "epoch": 41.9375,
      "grad_norm": 2.4509992599487305,
      "learning_rate": 3.23125e-05,
      "loss": 1.2545,
      "step": 2684
    },
    {
      "epoch": 41.953125,
      "grad_norm": 2.6310081481933594,
      "learning_rate": 3.2250000000000005e-05,
      "loss": 1.0442,
      "step": 2685
    },
    {
      "epoch": 41.96875,
      "grad_norm": 2.2675113677978516,
      "learning_rate": 3.21875e-05,
      "loss": 0.98,
      "step": 2686
    },
    {
      "epoch": 41.984375,
      "grad_norm": 2.3213257789611816,
      "learning_rate": 3.2125e-05,
      "loss": 1.412,
      "step": 2687
    },
    {
      "epoch": 42.0,
      "grad_norm": 2.671680450439453,
      "learning_rate": 3.20625e-05,
      "loss": 1.259,
      "step": 2688
    },
    {
      "epoch": 42.0,
      "eval_loss": 3.0972280502319336,
      "eval_runtime": 2.9254,
      "eval_samples_per_second": 175.021,
      "eval_steps_per_second": 43.755,
      "step": 2688
    },
    {
      "epoch": 42.015625,
      "grad_norm": 2.475613832473755,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.0803,
      "step": 2689
    },
    {
      "epoch": 42.03125,
      "grad_norm": 2.4353902339935303,
      "learning_rate": 3.1937500000000004e-05,
      "loss": 1.2921,
      "step": 2690
    },
    {
      "epoch": 42.046875,
      "grad_norm": 2.419356346130371,
      "learning_rate": 3.1875e-05,
      "loss": 1.341,
      "step": 2691
    },
    {
      "epoch": 42.0625,
      "grad_norm": 2.27209734916687,
      "learning_rate": 3.18125e-05,
      "loss": 1.171,
      "step": 2692
    },
    {
      "epoch": 42.078125,
      "grad_norm": 2.6878230571746826,
      "learning_rate": 3.175e-05,
      "loss": 1.1953,
      "step": 2693
    },
    {
      "epoch": 42.09375,
      "grad_norm": 2.5040080547332764,
      "learning_rate": 3.1687500000000005e-05,
      "loss": 1.1855,
      "step": 2694
    },
    {
      "epoch": 42.109375,
      "grad_norm": 2.4992878437042236,
      "learning_rate": 3.1624999999999996e-05,
      "loss": 1.0711,
      "step": 2695
    },
    {
      "epoch": 42.125,
      "grad_norm": 2.2573580741882324,
      "learning_rate": 3.15625e-05,
      "loss": 1.2957,
      "step": 2696
    },
    {
      "epoch": 42.140625,
      "grad_norm": 2.695172071456909,
      "learning_rate": 3.15e-05,
      "loss": 1.3168,
      "step": 2697
    },
    {
      "epoch": 42.15625,
      "grad_norm": 2.4612441062927246,
      "learning_rate": 3.1437500000000005e-05,
      "loss": 1.4493,
      "step": 2698
    },
    {
      "epoch": 42.171875,
      "grad_norm": 2.6228952407836914,
      "learning_rate": 3.1375e-05,
      "loss": 1.2465,
      "step": 2699
    },
    {
      "epoch": 42.1875,
      "grad_norm": 2.362769842147827,
      "learning_rate": 3.13125e-05,
      "loss": 1.3031,
      "step": 2700
    },
    {
      "epoch": 42.203125,
      "grad_norm": 2.4696662425994873,
      "learning_rate": 3.125e-05,
      "loss": 1.1681,
      "step": 2701
    },
    {
      "epoch": 42.21875,
      "grad_norm": 2.112215518951416,
      "learning_rate": 3.1187500000000006e-05,
      "loss": 1.1877,
      "step": 2702
    },
    {
      "epoch": 42.234375,
      "grad_norm": 2.1868057250976562,
      "learning_rate": 3.1125000000000004e-05,
      "loss": 1.1678,
      "step": 2703
    },
    {
      "epoch": 42.25,
      "grad_norm": 2.4941864013671875,
      "learning_rate": 3.10625e-05,
      "loss": 1.1944,
      "step": 2704
    },
    {
      "epoch": 42.265625,
      "grad_norm": 2.4657108783721924,
      "learning_rate": 3.1e-05,
      "loss": 1.0909,
      "step": 2705
    },
    {
      "epoch": 42.28125,
      "grad_norm": 2.2086589336395264,
      "learning_rate": 3.09375e-05,
      "loss": 1.0061,
      "step": 2706
    },
    {
      "epoch": 42.296875,
      "grad_norm": 2.7440261840820312,
      "learning_rate": 3.0875000000000005e-05,
      "loss": 1.0369,
      "step": 2707
    },
    {
      "epoch": 42.3125,
      "grad_norm": 2.3309335708618164,
      "learning_rate": 3.08125e-05,
      "loss": 1.0875,
      "step": 2708
    },
    {
      "epoch": 42.328125,
      "grad_norm": 2.5594136714935303,
      "learning_rate": 3.075e-05,
      "loss": 1.0731,
      "step": 2709
    },
    {
      "epoch": 42.34375,
      "grad_norm": 2.472770929336548,
      "learning_rate": 3.06875e-05,
      "loss": 1.4292,
      "step": 2710
    },
    {
      "epoch": 42.359375,
      "grad_norm": 2.4768545627593994,
      "learning_rate": 3.0625000000000006e-05,
      "loss": 1.4916,
      "step": 2711
    },
    {
      "epoch": 42.375,
      "grad_norm": 2.1882755756378174,
      "learning_rate": 3.05625e-05,
      "loss": 1.2473,
      "step": 2712
    },
    {
      "epoch": 42.390625,
      "grad_norm": 2.6267199516296387,
      "learning_rate": 3.05e-05,
      "loss": 1.1554,
      "step": 2713
    },
    {
      "epoch": 42.40625,
      "grad_norm": 2.409977912902832,
      "learning_rate": 3.04375e-05,
      "loss": 1.2906,
      "step": 2714
    },
    {
      "epoch": 42.421875,
      "grad_norm": 2.287445545196533,
      "learning_rate": 3.0375000000000003e-05,
      "loss": 1.3287,
      "step": 2715
    },
    {
      "epoch": 42.4375,
      "grad_norm": 2.4203593730926514,
      "learning_rate": 3.0312499999999998e-05,
      "loss": 1.063,
      "step": 2716
    },
    {
      "epoch": 42.453125,
      "grad_norm": 2.7540359497070312,
      "learning_rate": 3.025e-05,
      "loss": 1.2977,
      "step": 2717
    },
    {
      "epoch": 42.46875,
      "grad_norm": 2.3857526779174805,
      "learning_rate": 3.0187500000000002e-05,
      "loss": 1.1327,
      "step": 2718
    },
    {
      "epoch": 42.484375,
      "grad_norm": 2.4010088443756104,
      "learning_rate": 3.0125000000000004e-05,
      "loss": 1.0723,
      "step": 2719
    },
    {
      "epoch": 42.5,
      "grad_norm": 2.626354932785034,
      "learning_rate": 3.00625e-05,
      "loss": 1.2006,
      "step": 2720
    },
    {
      "epoch": 42.515625,
      "grad_norm": 2.3263797760009766,
      "learning_rate": 3e-05,
      "loss": 1.0737,
      "step": 2721
    },
    {
      "epoch": 42.53125,
      "grad_norm": 2.612545967102051,
      "learning_rate": 2.9937500000000003e-05,
      "loss": 1.1784,
      "step": 2722
    },
    {
      "epoch": 42.546875,
      "grad_norm": 2.3897507190704346,
      "learning_rate": 2.9875000000000004e-05,
      "loss": 1.1119,
      "step": 2723
    },
    {
      "epoch": 42.5625,
      "grad_norm": 2.4584875106811523,
      "learning_rate": 2.98125e-05,
      "loss": 1.2387,
      "step": 2724
    },
    {
      "epoch": 42.578125,
      "grad_norm": 2.3746025562286377,
      "learning_rate": 2.975e-05,
      "loss": 1.2191,
      "step": 2725
    },
    {
      "epoch": 42.59375,
      "grad_norm": 2.344606876373291,
      "learning_rate": 2.96875e-05,
      "loss": 1.2983,
      "step": 2726
    },
    {
      "epoch": 42.609375,
      "grad_norm": 2.4242169857025146,
      "learning_rate": 2.9625000000000002e-05,
      "loss": 1.2239,
      "step": 2727
    },
    {
      "epoch": 42.625,
      "grad_norm": 2.625930070877075,
      "learning_rate": 2.9562500000000004e-05,
      "loss": 1.1636,
      "step": 2728
    },
    {
      "epoch": 42.640625,
      "grad_norm": 2.490241527557373,
      "learning_rate": 2.95e-05,
      "loss": 1.3841,
      "step": 2729
    },
    {
      "epoch": 42.65625,
      "grad_norm": 2.5366830825805664,
      "learning_rate": 2.94375e-05,
      "loss": 1.2919,
      "step": 2730
    },
    {
      "epoch": 42.671875,
      "grad_norm": 2.2196755409240723,
      "learning_rate": 2.9375000000000003e-05,
      "loss": 1.0538,
      "step": 2731
    },
    {
      "epoch": 42.6875,
      "grad_norm": 2.454305410385132,
      "learning_rate": 2.9312500000000004e-05,
      "loss": 1.2682,
      "step": 2732
    },
    {
      "epoch": 42.703125,
      "grad_norm": 2.6578006744384766,
      "learning_rate": 2.925e-05,
      "loss": 1.3389,
      "step": 2733
    },
    {
      "epoch": 42.71875,
      "grad_norm": 2.240450620651245,
      "learning_rate": 2.91875e-05,
      "loss": 1.2778,
      "step": 2734
    },
    {
      "epoch": 42.734375,
      "grad_norm": 2.476793050765991,
      "learning_rate": 2.9125000000000003e-05,
      "loss": 1.2587,
      "step": 2735
    },
    {
      "epoch": 42.75,
      "grad_norm": 2.475320816040039,
      "learning_rate": 2.9062500000000005e-05,
      "loss": 1.3894,
      "step": 2736
    },
    {
      "epoch": 42.765625,
      "grad_norm": 2.5630123615264893,
      "learning_rate": 2.9e-05,
      "loss": 1.1921,
      "step": 2737
    },
    {
      "epoch": 42.78125,
      "grad_norm": 2.2658450603485107,
      "learning_rate": 2.8937500000000002e-05,
      "loss": 1.057,
      "step": 2738
    },
    {
      "epoch": 42.796875,
      "grad_norm": 2.4620161056518555,
      "learning_rate": 2.8875e-05,
      "loss": 1.295,
      "step": 2739
    },
    {
      "epoch": 42.8125,
      "grad_norm": 2.499530076980591,
      "learning_rate": 2.8812500000000002e-05,
      "loss": 1.0249,
      "step": 2740
    },
    {
      "epoch": 42.828125,
      "grad_norm": 2.4321253299713135,
      "learning_rate": 2.8749999999999997e-05,
      "loss": 1.0734,
      "step": 2741
    },
    {
      "epoch": 42.84375,
      "grad_norm": 2.6723973751068115,
      "learning_rate": 2.86875e-05,
      "loss": 1.3495,
      "step": 2742
    },
    {
      "epoch": 42.859375,
      "grad_norm": 2.390220880508423,
      "learning_rate": 2.8625e-05,
      "loss": 1.1313,
      "step": 2743
    },
    {
      "epoch": 42.875,
      "grad_norm": 2.2521090507507324,
      "learning_rate": 2.8562500000000003e-05,
      "loss": 1.0281,
      "step": 2744
    },
    {
      "epoch": 42.890625,
      "grad_norm": 2.4320785999298096,
      "learning_rate": 2.8499999999999998e-05,
      "loss": 1.1977,
      "step": 2745
    },
    {
      "epoch": 42.90625,
      "grad_norm": 2.7909481525421143,
      "learning_rate": 2.84375e-05,
      "loss": 0.979,
      "step": 2746
    },
    {
      "epoch": 42.921875,
      "grad_norm": 2.3112900257110596,
      "learning_rate": 2.8375000000000002e-05,
      "loss": 1.0286,
      "step": 2747
    },
    {
      "epoch": 42.9375,
      "grad_norm": 2.4714651107788086,
      "learning_rate": 2.8312500000000004e-05,
      "loss": 1.006,
      "step": 2748
    },
    {
      "epoch": 42.953125,
      "grad_norm": 2.0498578548431396,
      "learning_rate": 2.825e-05,
      "loss": 1.087,
      "step": 2749
    },
    {
      "epoch": 42.96875,
      "grad_norm": 2.417611837387085,
      "learning_rate": 2.81875e-05,
      "loss": 1.1788,
      "step": 2750
    },
    {
      "epoch": 42.984375,
      "grad_norm": 2.61012601852417,
      "learning_rate": 2.8125000000000003e-05,
      "loss": 1.1755,
      "step": 2751
    },
    {
      "epoch": 43.0,
      "grad_norm": 4.069469928741455,
      "learning_rate": 2.80625e-05,
      "loss": 1.1209,
      "step": 2752
    },
    {
      "epoch": 43.0,
      "eval_loss": 3.1068687438964844,
      "eval_runtime": 2.9141,
      "eval_samples_per_second": 175.696,
      "eval_steps_per_second": 43.924,
      "step": 2752
    },
    {
      "epoch": 43.015625,
      "grad_norm": 2.4750943183898926,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.9746,
      "step": 2753
    },
    {
      "epoch": 43.03125,
      "grad_norm": 2.4494035243988037,
      "learning_rate": 2.79375e-05,
      "loss": 1.3107,
      "step": 2754
    },
    {
      "epoch": 43.046875,
      "grad_norm": 2.828680992126465,
      "learning_rate": 2.7875e-05,
      "loss": 1.3037,
      "step": 2755
    },
    {
      "epoch": 43.0625,
      "grad_norm": 2.6593801975250244,
      "learning_rate": 2.7812500000000002e-05,
      "loss": 1.1315,
      "step": 2756
    },
    {
      "epoch": 43.078125,
      "grad_norm": 2.556492805480957,
      "learning_rate": 2.7750000000000004e-05,
      "loss": 1.2445,
      "step": 2757
    },
    {
      "epoch": 43.09375,
      "grad_norm": 2.309091567993164,
      "learning_rate": 2.76875e-05,
      "loss": 1.3266,
      "step": 2758
    },
    {
      "epoch": 43.109375,
      "grad_norm": 2.3435139656066895,
      "learning_rate": 2.7625e-05,
      "loss": 1.1273,
      "step": 2759
    },
    {
      "epoch": 43.125,
      "grad_norm": 2.1871654987335205,
      "learning_rate": 2.7562500000000002e-05,
      "loss": 1.4062,
      "step": 2760
    },
    {
      "epoch": 43.140625,
      "grad_norm": 2.4761109352111816,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 1.2031,
      "step": 2761
    },
    {
      "epoch": 43.15625,
      "grad_norm": 2.3947606086730957,
      "learning_rate": 2.74375e-05,
      "loss": 1.0777,
      "step": 2762
    },
    {
      "epoch": 43.171875,
      "grad_norm": 2.6596732139587402,
      "learning_rate": 2.7375e-05,
      "loss": 1.2739,
      "step": 2763
    },
    {
      "epoch": 43.1875,
      "grad_norm": 2.2814416885375977,
      "learning_rate": 2.7312500000000003e-05,
      "loss": 1.3923,
      "step": 2764
    },
    {
      "epoch": 43.203125,
      "grad_norm": 2.2753913402557373,
      "learning_rate": 2.725e-05,
      "loss": 1.0799,
      "step": 2765
    },
    {
      "epoch": 43.21875,
      "grad_norm": 2.157034397125244,
      "learning_rate": 2.71875e-05,
      "loss": 1.306,
      "step": 2766
    },
    {
      "epoch": 43.234375,
      "grad_norm": 2.4572529792785645,
      "learning_rate": 2.7125000000000002e-05,
      "loss": 1.0952,
      "step": 2767
    },
    {
      "epoch": 43.25,
      "grad_norm": 2.436840295791626,
      "learning_rate": 2.70625e-05,
      "loss": 1.1604,
      "step": 2768
    },
    {
      "epoch": 43.265625,
      "grad_norm": 2.566051959991455,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.073,
      "step": 2769
    },
    {
      "epoch": 43.28125,
      "grad_norm": 2.3944320678710938,
      "learning_rate": 2.6937499999999997e-05,
      "loss": 0.9778,
      "step": 2770
    },
    {
      "epoch": 43.296875,
      "grad_norm": 2.5431716442108154,
      "learning_rate": 2.6875e-05,
      "loss": 1.1952,
      "step": 2771
    },
    {
      "epoch": 43.3125,
      "grad_norm": 2.1081626415252686,
      "learning_rate": 2.68125e-05,
      "loss": 1.3763,
      "step": 2772
    },
    {
      "epoch": 43.328125,
      "grad_norm": 2.457777261734009,
      "learning_rate": 2.6750000000000003e-05,
      "loss": 1.2822,
      "step": 2773
    },
    {
      "epoch": 43.34375,
      "grad_norm": 2.6582374572753906,
      "learning_rate": 2.6687499999999998e-05,
      "loss": 1.2069,
      "step": 2774
    },
    {
      "epoch": 43.359375,
      "grad_norm": 2.453037738800049,
      "learning_rate": 2.6625e-05,
      "loss": 1.2416,
      "step": 2775
    },
    {
      "epoch": 43.375,
      "grad_norm": 2.4768948554992676,
      "learning_rate": 2.6562500000000002e-05,
      "loss": 1.2808,
      "step": 2776
    },
    {
      "epoch": 43.390625,
      "grad_norm": 2.298992395401001,
      "learning_rate": 2.6500000000000004e-05,
      "loss": 1.2077,
      "step": 2777
    },
    {
      "epoch": 43.40625,
      "grad_norm": 2.513270139694214,
      "learning_rate": 2.6437500000000002e-05,
      "loss": 1.3279,
      "step": 2778
    },
    {
      "epoch": 43.421875,
      "grad_norm": 2.2735772132873535,
      "learning_rate": 2.6375e-05,
      "loss": 1.1629,
      "step": 2779
    },
    {
      "epoch": 43.4375,
      "grad_norm": 2.3572447299957275,
      "learning_rate": 2.6312500000000003e-05,
      "loss": 1.2024,
      "step": 2780
    },
    {
      "epoch": 43.453125,
      "grad_norm": 2.4243102073669434,
      "learning_rate": 2.625e-05,
      "loss": 1.2097,
      "step": 2781
    },
    {
      "epoch": 43.46875,
      "grad_norm": 2.2641124725341797,
      "learning_rate": 2.6187500000000003e-05,
      "loss": 1.2813,
      "step": 2782
    },
    {
      "epoch": 43.484375,
      "grad_norm": 2.823176145553589,
      "learning_rate": 2.6124999999999998e-05,
      "loss": 1.2658,
      "step": 2783
    },
    {
      "epoch": 43.5,
      "grad_norm": 2.720815658569336,
      "learning_rate": 2.60625e-05,
      "loss": 1.206,
      "step": 2784
    },
    {
      "epoch": 43.515625,
      "grad_norm": 2.5369770526885986,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.2987,
      "step": 2785
    },
    {
      "epoch": 43.53125,
      "grad_norm": 2.4620554447174072,
      "learning_rate": 2.5937500000000004e-05,
      "loss": 1.1919,
      "step": 2786
    },
    {
      "epoch": 43.546875,
      "grad_norm": 2.3221139907836914,
      "learning_rate": 2.5875e-05,
      "loss": 0.9489,
      "step": 2787
    },
    {
      "epoch": 43.5625,
      "grad_norm": 2.30381441116333,
      "learning_rate": 2.58125e-05,
      "loss": 1.1982,
      "step": 2788
    },
    {
      "epoch": 43.578125,
      "grad_norm": 2.5033509731292725,
      "learning_rate": 2.5750000000000002e-05,
      "loss": 0.996,
      "step": 2789
    },
    {
      "epoch": 43.59375,
      "grad_norm": 2.298875093460083,
      "learning_rate": 2.5687500000000004e-05,
      "loss": 0.9849,
      "step": 2790
    },
    {
      "epoch": 43.609375,
      "grad_norm": 2.635570526123047,
      "learning_rate": 2.5625e-05,
      "loss": 1.1041,
      "step": 2791
    },
    {
      "epoch": 43.625,
      "grad_norm": 2.2661798000335693,
      "learning_rate": 2.55625e-05,
      "loss": 1.0946,
      "step": 2792
    },
    {
      "epoch": 43.640625,
      "grad_norm": 2.3072845935821533,
      "learning_rate": 2.5500000000000003e-05,
      "loss": 1.0544,
      "step": 2793
    },
    {
      "epoch": 43.65625,
      "grad_norm": 2.882960796356201,
      "learning_rate": 2.54375e-05,
      "loss": 1.1534,
      "step": 2794
    },
    {
      "epoch": 43.671875,
      "grad_norm": 2.220487356185913,
      "learning_rate": 2.5375e-05,
      "loss": 1.1268,
      "step": 2795
    },
    {
      "epoch": 43.6875,
      "grad_norm": 2.3770833015441895,
      "learning_rate": 2.53125e-05,
      "loss": 1.1744,
      "step": 2796
    },
    {
      "epoch": 43.703125,
      "grad_norm": 2.4134552478790283,
      "learning_rate": 2.525e-05,
      "loss": 1.108,
      "step": 2797
    },
    {
      "epoch": 43.71875,
      "grad_norm": 2.3666529655456543,
      "learning_rate": 2.5187500000000002e-05,
      "loss": 1.1025,
      "step": 2798
    },
    {
      "epoch": 43.734375,
      "grad_norm": 2.3284032344818115,
      "learning_rate": 2.5124999999999997e-05,
      "loss": 1.1145,
      "step": 2799
    },
    {
      "epoch": 43.75,
      "grad_norm": 2.5647850036621094,
      "learning_rate": 2.50625e-05,
      "loss": 1.3046,
      "step": 2800
    },
    {
      "epoch": 43.765625,
      "grad_norm": 2.8137524127960205,
      "learning_rate": 2.5e-05,
      "loss": 1.3158,
      "step": 2801
    },
    {
      "epoch": 43.78125,
      "grad_norm": 2.3966639041900635,
      "learning_rate": 2.4937500000000003e-05,
      "loss": 1.1626,
      "step": 2802
    },
    {
      "epoch": 43.796875,
      "grad_norm": 2.8384530544281006,
      "learning_rate": 2.4875e-05,
      "loss": 1.1199,
      "step": 2803
    },
    {
      "epoch": 43.8125,
      "grad_norm": 2.451833724975586,
      "learning_rate": 2.4812500000000003e-05,
      "loss": 0.8208,
      "step": 2804
    },
    {
      "epoch": 43.828125,
      "grad_norm": 2.43778920173645,
      "learning_rate": 2.4750000000000002e-05,
      "loss": 1.2967,
      "step": 2805
    },
    {
      "epoch": 43.84375,
      "grad_norm": 2.4553310871124268,
      "learning_rate": 2.4687500000000004e-05,
      "loss": 1.218,
      "step": 2806
    },
    {
      "epoch": 43.859375,
      "grad_norm": 2.440749406814575,
      "learning_rate": 2.4625000000000002e-05,
      "loss": 1.1869,
      "step": 2807
    },
    {
      "epoch": 43.875,
      "grad_norm": 2.4439210891723633,
      "learning_rate": 2.45625e-05,
      "loss": 1.1718,
      "step": 2808
    },
    {
      "epoch": 43.890625,
      "grad_norm": 2.4662392139434814,
      "learning_rate": 2.45e-05,
      "loss": 1.1341,
      "step": 2809
    },
    {
      "epoch": 43.90625,
      "grad_norm": 2.541395664215088,
      "learning_rate": 2.44375e-05,
      "loss": 1.0246,
      "step": 2810
    },
    {
      "epoch": 43.921875,
      "grad_norm": 2.3018085956573486,
      "learning_rate": 2.4375e-05,
      "loss": 1.1793,
      "step": 2811
    },
    {
      "epoch": 43.9375,
      "grad_norm": 2.6576755046844482,
      "learning_rate": 2.43125e-05,
      "loss": 1.2251,
      "step": 2812
    },
    {
      "epoch": 43.953125,
      "grad_norm": 2.390911102294922,
      "learning_rate": 2.425e-05,
      "loss": 1.1667,
      "step": 2813
    },
    {
      "epoch": 43.96875,
      "grad_norm": 2.802027463912964,
      "learning_rate": 2.4187500000000002e-05,
      "loss": 1.145,
      "step": 2814
    },
    {
      "epoch": 43.984375,
      "grad_norm": 2.4770989418029785,
      "learning_rate": 2.4125e-05,
      "loss": 1.1296,
      "step": 2815
    },
    {
      "epoch": 44.0,
      "grad_norm": 3.288362979888916,
      "learning_rate": 2.4062500000000002e-05,
      "loss": 0.9966,
      "step": 2816
    },
    {
      "epoch": 44.0,
      "eval_loss": 3.1000537872314453,
      "eval_runtime": 2.947,
      "eval_samples_per_second": 173.736,
      "eval_steps_per_second": 43.434,
      "step": 2816
    },
    {
      "epoch": 44.015625,
      "grad_norm": 2.409644365310669,
      "learning_rate": 2.4e-05,
      "loss": 1.0906,
      "step": 2817
    },
    {
      "epoch": 44.03125,
      "grad_norm": 2.673661947250366,
      "learning_rate": 2.3937500000000002e-05,
      "loss": 1.2602,
      "step": 2818
    },
    {
      "epoch": 44.046875,
      "grad_norm": 2.2938337326049805,
      "learning_rate": 2.3875e-05,
      "loss": 1.1847,
      "step": 2819
    },
    {
      "epoch": 44.0625,
      "grad_norm": 2.1721954345703125,
      "learning_rate": 2.3812500000000003e-05,
      "loss": 1.2735,
      "step": 2820
    },
    {
      "epoch": 44.078125,
      "grad_norm": 2.4037692546844482,
      "learning_rate": 2.375e-05,
      "loss": 1.2687,
      "step": 2821
    },
    {
      "epoch": 44.09375,
      "grad_norm": 2.358041763305664,
      "learning_rate": 2.36875e-05,
      "loss": 1.0911,
      "step": 2822
    },
    {
      "epoch": 44.109375,
      "grad_norm": 2.588340997695923,
      "learning_rate": 2.3624999999999998e-05,
      "loss": 1.002,
      "step": 2823
    },
    {
      "epoch": 44.125,
      "grad_norm": 2.6705334186553955,
      "learning_rate": 2.35625e-05,
      "loss": 1.0746,
      "step": 2824
    },
    {
      "epoch": 44.140625,
      "grad_norm": 2.5213074684143066,
      "learning_rate": 2.35e-05,
      "loss": 1.1037,
      "step": 2825
    },
    {
      "epoch": 44.15625,
      "grad_norm": 2.297910690307617,
      "learning_rate": 2.34375e-05,
      "loss": 1.1095,
      "step": 2826
    },
    {
      "epoch": 44.171875,
      "grad_norm": 2.6412668228149414,
      "learning_rate": 2.3375000000000002e-05,
      "loss": 1.0917,
      "step": 2827
    },
    {
      "epoch": 44.1875,
      "grad_norm": 2.301602363586426,
      "learning_rate": 2.33125e-05,
      "loss": 1.3733,
      "step": 2828
    },
    {
      "epoch": 44.203125,
      "grad_norm": 2.3363893032073975,
      "learning_rate": 2.3250000000000003e-05,
      "loss": 1.3798,
      "step": 2829
    },
    {
      "epoch": 44.21875,
      "grad_norm": 2.2964444160461426,
      "learning_rate": 2.31875e-05,
      "loss": 1.2098,
      "step": 2830
    },
    {
      "epoch": 44.234375,
      "grad_norm": 2.2454638481140137,
      "learning_rate": 2.3125000000000003e-05,
      "loss": 0.9989,
      "step": 2831
    },
    {
      "epoch": 44.25,
      "grad_norm": 2.7472822666168213,
      "learning_rate": 2.30625e-05,
      "loss": 1.3958,
      "step": 2832
    },
    {
      "epoch": 44.265625,
      "grad_norm": 2.5875861644744873,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 1.2216,
      "step": 2833
    },
    {
      "epoch": 44.28125,
      "grad_norm": 2.613590955734253,
      "learning_rate": 2.2937500000000002e-05,
      "loss": 1.1597,
      "step": 2834
    },
    {
      "epoch": 44.296875,
      "grad_norm": 2.3459904193878174,
      "learning_rate": 2.2875e-05,
      "loss": 1.2707,
      "step": 2835
    },
    {
      "epoch": 44.3125,
      "grad_norm": 2.2222228050231934,
      "learning_rate": 2.28125e-05,
      "loss": 0.9476,
      "step": 2836
    },
    {
      "epoch": 44.328125,
      "grad_norm": 2.3691904544830322,
      "learning_rate": 2.275e-05,
      "loss": 1.2025,
      "step": 2837
    },
    {
      "epoch": 44.34375,
      "grad_norm": 2.276081085205078,
      "learning_rate": 2.26875e-05,
      "loss": 1.3001,
      "step": 2838
    },
    {
      "epoch": 44.359375,
      "grad_norm": 2.515096426010132,
      "learning_rate": 2.2625e-05,
      "loss": 1.1872,
      "step": 2839
    },
    {
      "epoch": 44.375,
      "grad_norm": 2.493860960006714,
      "learning_rate": 2.25625e-05,
      "loss": 1.2131,
      "step": 2840
    },
    {
      "epoch": 44.390625,
      "grad_norm": 2.480360507965088,
      "learning_rate": 2.25e-05,
      "loss": 1.2076,
      "step": 2841
    },
    {
      "epoch": 44.40625,
      "grad_norm": 2.393655776977539,
      "learning_rate": 2.24375e-05,
      "loss": 1.0464,
      "step": 2842
    },
    {
      "epoch": 44.421875,
      "grad_norm": 2.3374969959259033,
      "learning_rate": 2.2375000000000002e-05,
      "loss": 1.0295,
      "step": 2843
    },
    {
      "epoch": 44.4375,
      "grad_norm": 2.5968966484069824,
      "learning_rate": 2.23125e-05,
      "loss": 1.1236,
      "step": 2844
    },
    {
      "epoch": 44.453125,
      "grad_norm": 2.460550308227539,
      "learning_rate": 2.2250000000000002e-05,
      "loss": 1.3098,
      "step": 2845
    },
    {
      "epoch": 44.46875,
      "grad_norm": 2.2904489040374756,
      "learning_rate": 2.21875e-05,
      "loss": 1.1139,
      "step": 2846
    },
    {
      "epoch": 44.484375,
      "grad_norm": 2.279609203338623,
      "learning_rate": 2.2125000000000002e-05,
      "loss": 1.0763,
      "step": 2847
    },
    {
      "epoch": 44.5,
      "grad_norm": 2.524930477142334,
      "learning_rate": 2.20625e-05,
      "loss": 1.2234,
      "step": 2848
    },
    {
      "epoch": 44.515625,
      "grad_norm": 2.4716598987579346,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.2485,
      "step": 2849
    },
    {
      "epoch": 44.53125,
      "grad_norm": 2.670097589492798,
      "learning_rate": 2.19375e-05,
      "loss": 1.1929,
      "step": 2850
    },
    {
      "epoch": 44.546875,
      "grad_norm": 2.690058469772339,
      "learning_rate": 2.1875e-05,
      "loss": 1.011,
      "step": 2851
    },
    {
      "epoch": 44.5625,
      "grad_norm": 2.1861307621002197,
      "learning_rate": 2.18125e-05,
      "loss": 1.1486,
      "step": 2852
    },
    {
      "epoch": 44.578125,
      "grad_norm": 2.680424451828003,
      "learning_rate": 2.175e-05,
      "loss": 1.1983,
      "step": 2853
    },
    {
      "epoch": 44.59375,
      "grad_norm": 2.2810981273651123,
      "learning_rate": 2.1687500000000002e-05,
      "loss": 1.2929,
      "step": 2854
    },
    {
      "epoch": 44.609375,
      "grad_norm": 2.5041956901550293,
      "learning_rate": 2.1625e-05,
      "loss": 1.2575,
      "step": 2855
    },
    {
      "epoch": 44.625,
      "grad_norm": 2.2455668449401855,
      "learning_rate": 2.1562500000000002e-05,
      "loss": 1.1822,
      "step": 2856
    },
    {
      "epoch": 44.640625,
      "grad_norm": 2.4362449645996094,
      "learning_rate": 2.15e-05,
      "loss": 1.0041,
      "step": 2857
    },
    {
      "epoch": 44.65625,
      "grad_norm": 2.78784441947937,
      "learning_rate": 2.1437500000000003e-05,
      "loss": 1.2537,
      "step": 2858
    },
    {
      "epoch": 44.671875,
      "grad_norm": 2.1694464683532715,
      "learning_rate": 2.1375e-05,
      "loss": 1.3658,
      "step": 2859
    },
    {
      "epoch": 44.6875,
      "grad_norm": 2.8847196102142334,
      "learning_rate": 2.1312500000000003e-05,
      "loss": 1.1889,
      "step": 2860
    },
    {
      "epoch": 44.703125,
      "grad_norm": 2.2504994869232178,
      "learning_rate": 2.125e-05,
      "loss": 1.0606,
      "step": 2861
    },
    {
      "epoch": 44.71875,
      "grad_norm": 2.5293691158294678,
      "learning_rate": 2.1187500000000003e-05,
      "loss": 1.1611,
      "step": 2862
    },
    {
      "epoch": 44.734375,
      "grad_norm": 2.488469123840332,
      "learning_rate": 2.1125000000000002e-05,
      "loss": 1.0594,
      "step": 2863
    },
    {
      "epoch": 44.75,
      "grad_norm": 2.711676597595215,
      "learning_rate": 2.10625e-05,
      "loss": 1.0857,
      "step": 2864
    },
    {
      "epoch": 44.765625,
      "grad_norm": 2.3191709518432617,
      "learning_rate": 2.1e-05,
      "loss": 1.0155,
      "step": 2865
    },
    {
      "epoch": 44.78125,
      "grad_norm": 2.3514137268066406,
      "learning_rate": 2.09375e-05,
      "loss": 1.3291,
      "step": 2866
    },
    {
      "epoch": 44.796875,
      "grad_norm": 2.347370147705078,
      "learning_rate": 2.0875e-05,
      "loss": 1.3415,
      "step": 2867
    },
    {
      "epoch": 44.8125,
      "grad_norm": 2.386111259460449,
      "learning_rate": 2.08125e-05,
      "loss": 1.0668,
      "step": 2868
    },
    {
      "epoch": 44.828125,
      "grad_norm": 2.644868850708008,
      "learning_rate": 2.075e-05,
      "loss": 1.1098,
      "step": 2869
    },
    {
      "epoch": 44.84375,
      "grad_norm": 2.4955410957336426,
      "learning_rate": 2.06875e-05,
      "loss": 1.311,
      "step": 2870
    },
    {
      "epoch": 44.859375,
      "grad_norm": 2.703411817550659,
      "learning_rate": 2.0625e-05,
      "loss": 1.1851,
      "step": 2871
    },
    {
      "epoch": 44.875,
      "grad_norm": 2.2772605419158936,
      "learning_rate": 2.0562500000000002e-05,
      "loss": 1.3491,
      "step": 2872
    },
    {
      "epoch": 44.890625,
      "grad_norm": 2.4535439014434814,
      "learning_rate": 2.05e-05,
      "loss": 1.0653,
      "step": 2873
    },
    {
      "epoch": 44.90625,
      "grad_norm": 2.5521345138549805,
      "learning_rate": 2.0437500000000002e-05,
      "loss": 1.1304,
      "step": 2874
    },
    {
      "epoch": 44.921875,
      "grad_norm": 2.392369031906128,
      "learning_rate": 2.0375e-05,
      "loss": 1.2749,
      "step": 2875
    },
    {
      "epoch": 44.9375,
      "grad_norm": 2.5022313594818115,
      "learning_rate": 2.0312500000000002e-05,
      "loss": 1.0435,
      "step": 2876
    },
    {
      "epoch": 44.953125,
      "grad_norm": 2.4940192699432373,
      "learning_rate": 2.025e-05,
      "loss": 1.0392,
      "step": 2877
    },
    {
      "epoch": 44.96875,
      "grad_norm": 2.5066022872924805,
      "learning_rate": 2.01875e-05,
      "loss": 1.116,
      "step": 2878
    },
    {
      "epoch": 44.984375,
      "grad_norm": 2.2253828048706055,
      "learning_rate": 2.0125e-05,
      "loss": 1.1527,
      "step": 2879
    },
    {
      "epoch": 45.0,
      "grad_norm": 2.5327870845794678,
      "learning_rate": 2.00625e-05,
      "loss": 1.0355,
      "step": 2880
    },
    {
      "epoch": 45.0,
      "eval_loss": 3.105412006378174,
      "eval_runtime": 2.8493,
      "eval_samples_per_second": 179.693,
      "eval_steps_per_second": 44.923,
      "step": 2880
    },
    {
      "epoch": 45.015625,
      "grad_norm": 2.196115732192993,
      "learning_rate": 2e-05,
      "loss": 1.3592,
      "step": 2881
    },
    {
      "epoch": 45.03125,
      "grad_norm": 2.51855731010437,
      "learning_rate": 1.99375e-05,
      "loss": 1.305,
      "step": 2882
    },
    {
      "epoch": 45.046875,
      "grad_norm": 2.4158713817596436,
      "learning_rate": 1.9875000000000002e-05,
      "loss": 1.2308,
      "step": 2883
    },
    {
      "epoch": 45.0625,
      "grad_norm": 2.4094629287719727,
      "learning_rate": 1.98125e-05,
      "loss": 1.1567,
      "step": 2884
    },
    {
      "epoch": 45.078125,
      "grad_norm": 2.1717708110809326,
      "learning_rate": 1.9750000000000002e-05,
      "loss": 1.2655,
      "step": 2885
    },
    {
      "epoch": 45.09375,
      "grad_norm": 2.3107683658599854,
      "learning_rate": 1.96875e-05,
      "loss": 1.1593,
      "step": 2886
    },
    {
      "epoch": 45.109375,
      "grad_norm": 2.0286848545074463,
      "learning_rate": 1.9625000000000003e-05,
      "loss": 1.082,
      "step": 2887
    },
    {
      "epoch": 45.125,
      "grad_norm": 2.4671008586883545,
      "learning_rate": 1.95625e-05,
      "loss": 1.3201,
      "step": 2888
    },
    {
      "epoch": 45.140625,
      "grad_norm": 2.4482383728027344,
      "learning_rate": 1.9500000000000003e-05,
      "loss": 1.0158,
      "step": 2889
    },
    {
      "epoch": 45.15625,
      "grad_norm": 2.5378470420837402,
      "learning_rate": 1.94375e-05,
      "loss": 1.3557,
      "step": 2890
    },
    {
      "epoch": 45.171875,
      "grad_norm": 2.6747798919677734,
      "learning_rate": 1.9375e-05,
      "loss": 1.1951,
      "step": 2891
    },
    {
      "epoch": 45.1875,
      "grad_norm": 2.404109239578247,
      "learning_rate": 1.93125e-05,
      "loss": 1.0646,
      "step": 2892
    },
    {
      "epoch": 45.203125,
      "grad_norm": 2.5015957355499268,
      "learning_rate": 1.925e-05,
      "loss": 1.3287,
      "step": 2893
    },
    {
      "epoch": 45.21875,
      "grad_norm": 2.3428401947021484,
      "learning_rate": 1.91875e-05,
      "loss": 1.2273,
      "step": 2894
    },
    {
      "epoch": 45.234375,
      "grad_norm": 2.479483127593994,
      "learning_rate": 1.9125e-05,
      "loss": 1.1948,
      "step": 2895
    },
    {
      "epoch": 45.25,
      "grad_norm": 2.236844539642334,
      "learning_rate": 1.90625e-05,
      "loss": 1.104,
      "step": 2896
    },
    {
      "epoch": 45.265625,
      "grad_norm": 2.2109832763671875,
      "learning_rate": 1.9e-05,
      "loss": 1.1569,
      "step": 2897
    },
    {
      "epoch": 45.28125,
      "grad_norm": 2.313844919204712,
      "learning_rate": 1.89375e-05,
      "loss": 1.2288,
      "step": 2898
    },
    {
      "epoch": 45.296875,
      "grad_norm": 2.151214122772217,
      "learning_rate": 1.8875e-05,
      "loss": 1.2196,
      "step": 2899
    },
    {
      "epoch": 45.3125,
      "grad_norm": 2.3583168983459473,
      "learning_rate": 1.88125e-05,
      "loss": 1.2238,
      "step": 2900
    },
    {
      "epoch": 45.328125,
      "grad_norm": 2.3206863403320312,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 1.0785,
      "step": 2901
    },
    {
      "epoch": 45.34375,
      "grad_norm": 2.497419595718384,
      "learning_rate": 1.8687500000000004e-05,
      "loss": 1.3091,
      "step": 2902
    },
    {
      "epoch": 45.359375,
      "grad_norm": 2.567824363708496,
      "learning_rate": 1.8625000000000002e-05,
      "loss": 1.0306,
      "step": 2903
    },
    {
      "epoch": 45.375,
      "grad_norm": 2.675183057785034,
      "learning_rate": 1.85625e-05,
      "loss": 1.2194,
      "step": 2904
    },
    {
      "epoch": 45.390625,
      "grad_norm": 2.377007007598877,
      "learning_rate": 1.85e-05,
      "loss": 1.2668,
      "step": 2905
    },
    {
      "epoch": 45.40625,
      "grad_norm": 2.308773994445801,
      "learning_rate": 1.84375e-05,
      "loss": 1.1701,
      "step": 2906
    },
    {
      "epoch": 45.421875,
      "grad_norm": 2.2715916633605957,
      "learning_rate": 1.8375e-05,
      "loss": 1.111,
      "step": 2907
    },
    {
      "epoch": 45.4375,
      "grad_norm": 2.4394092559814453,
      "learning_rate": 1.83125e-05,
      "loss": 1.1934,
      "step": 2908
    },
    {
      "epoch": 45.453125,
      "grad_norm": 2.3767683506011963,
      "learning_rate": 1.825e-05,
      "loss": 1.3559,
      "step": 2909
    },
    {
      "epoch": 45.46875,
      "grad_norm": 2.5780656337738037,
      "learning_rate": 1.81875e-05,
      "loss": 1.1878,
      "step": 2910
    },
    {
      "epoch": 45.484375,
      "grad_norm": 2.4016926288604736,
      "learning_rate": 1.8125e-05,
      "loss": 1.0589,
      "step": 2911
    },
    {
      "epoch": 45.5,
      "grad_norm": 2.5470457077026367,
      "learning_rate": 1.8062500000000002e-05,
      "loss": 1.2405,
      "step": 2912
    },
    {
      "epoch": 45.515625,
      "grad_norm": 2.159393072128296,
      "learning_rate": 1.8e-05,
      "loss": 1.1583,
      "step": 2913
    },
    {
      "epoch": 45.53125,
      "grad_norm": 2.53121280670166,
      "learning_rate": 1.7937500000000002e-05,
      "loss": 1.0646,
      "step": 2914
    },
    {
      "epoch": 45.546875,
      "grad_norm": 2.8538427352905273,
      "learning_rate": 1.7875e-05,
      "loss": 1.0561,
      "step": 2915
    },
    {
      "epoch": 45.5625,
      "grad_norm": 2.666128635406494,
      "learning_rate": 1.7812500000000003e-05,
      "loss": 1.0358,
      "step": 2916
    },
    {
      "epoch": 45.578125,
      "grad_norm": 2.4529001712799072,
      "learning_rate": 1.775e-05,
      "loss": 1.208,
      "step": 2917
    },
    {
      "epoch": 45.59375,
      "grad_norm": 2.3907933235168457,
      "learning_rate": 1.76875e-05,
      "loss": 1.2417,
      "step": 2918
    },
    {
      "epoch": 45.609375,
      "grad_norm": 2.540334701538086,
      "learning_rate": 1.7625e-05,
      "loss": 1.094,
      "step": 2919
    },
    {
      "epoch": 45.625,
      "grad_norm": 2.8666157722473145,
      "learning_rate": 1.75625e-05,
      "loss": 1.0499,
      "step": 2920
    },
    {
      "epoch": 45.640625,
      "grad_norm": 2.17844557762146,
      "learning_rate": 1.75e-05,
      "loss": 1.1186,
      "step": 2921
    },
    {
      "epoch": 45.65625,
      "grad_norm": 2.2549359798431396,
      "learning_rate": 1.74375e-05,
      "loss": 1.4351,
      "step": 2922
    },
    {
      "epoch": 45.671875,
      "grad_norm": 2.2951834201812744,
      "learning_rate": 1.7375e-05,
      "loss": 1.1059,
      "step": 2923
    },
    {
      "epoch": 45.6875,
      "grad_norm": 2.4729669094085693,
      "learning_rate": 1.73125e-05,
      "loss": 1.0897,
      "step": 2924
    },
    {
      "epoch": 45.703125,
      "grad_norm": 2.9687843322753906,
      "learning_rate": 1.725e-05,
      "loss": 1.2557,
      "step": 2925
    },
    {
      "epoch": 45.71875,
      "grad_norm": 2.500861883163452,
      "learning_rate": 1.71875e-05,
      "loss": 1.148,
      "step": 2926
    },
    {
      "epoch": 45.734375,
      "grad_norm": 2.560964822769165,
      "learning_rate": 1.7125000000000003e-05,
      "loss": 0.8362,
      "step": 2927
    },
    {
      "epoch": 45.75,
      "grad_norm": 2.8030872344970703,
      "learning_rate": 1.70625e-05,
      "loss": 1.3242,
      "step": 2928
    },
    {
      "epoch": 45.765625,
      "grad_norm": 2.3654818534851074,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 1.2149,
      "step": 2929
    },
    {
      "epoch": 45.78125,
      "grad_norm": 2.6871697902679443,
      "learning_rate": 1.6937500000000002e-05,
      "loss": 1.2076,
      "step": 2930
    },
    {
      "epoch": 45.796875,
      "grad_norm": 2.21801495552063,
      "learning_rate": 1.6875000000000004e-05,
      "loss": 1.1514,
      "step": 2931
    },
    {
      "epoch": 45.8125,
      "grad_norm": 2.149412155151367,
      "learning_rate": 1.6812500000000002e-05,
      "loss": 1.1412,
      "step": 2932
    },
    {
      "epoch": 45.828125,
      "grad_norm": 2.4001100063323975,
      "learning_rate": 1.675e-05,
      "loss": 1.1736,
      "step": 2933
    },
    {
      "epoch": 45.84375,
      "grad_norm": 2.5856988430023193,
      "learning_rate": 1.66875e-05,
      "loss": 1.1589,
      "step": 2934
    },
    {
      "epoch": 45.859375,
      "grad_norm": 2.4305808544158936,
      "learning_rate": 1.6625e-05,
      "loss": 1.2115,
      "step": 2935
    },
    {
      "epoch": 45.875,
      "grad_norm": 2.5104408264160156,
      "learning_rate": 1.65625e-05,
      "loss": 1.0852,
      "step": 2936
    },
    {
      "epoch": 45.890625,
      "grad_norm": 2.7565414905548096,
      "learning_rate": 1.65e-05,
      "loss": 1.0871,
      "step": 2937
    },
    {
      "epoch": 45.90625,
      "grad_norm": 2.399238348007202,
      "learning_rate": 1.64375e-05,
      "loss": 1.0677,
      "step": 2938
    },
    {
      "epoch": 45.921875,
      "grad_norm": 2.6433889865875244,
      "learning_rate": 1.6375e-05,
      "loss": 1.287,
      "step": 2939
    },
    {
      "epoch": 45.9375,
      "grad_norm": 2.7179532051086426,
      "learning_rate": 1.63125e-05,
      "loss": 0.8017,
      "step": 2940
    },
    {
      "epoch": 45.953125,
      "grad_norm": 2.455639362335205,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 1.1988,
      "step": 2941
    },
    {
      "epoch": 45.96875,
      "grad_norm": 2.5747902393341064,
      "learning_rate": 1.61875e-05,
      "loss": 1.1233,
      "step": 2942
    },
    {
      "epoch": 45.984375,
      "grad_norm": 2.218672513961792,
      "learning_rate": 1.6125000000000002e-05,
      "loss": 1.1437,
      "step": 2943
    },
    {
      "epoch": 46.0,
      "grad_norm": 3.2494869232177734,
      "learning_rate": 1.60625e-05,
      "loss": 1.0543,
      "step": 2944
    },
    {
      "epoch": 46.0,
      "eval_loss": 3.104710102081299,
      "eval_runtime": 2.8861,
      "eval_samples_per_second": 177.403,
      "eval_steps_per_second": 44.351,
      "step": 2944
    },
    {
      "epoch": 46.015625,
      "grad_norm": 2.5384535789489746,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.1388,
      "step": 2945
    },
    {
      "epoch": 46.03125,
      "grad_norm": 2.4476356506347656,
      "learning_rate": 1.59375e-05,
      "loss": 1.1915,
      "step": 2946
    },
    {
      "epoch": 46.046875,
      "grad_norm": 2.1690833568573,
      "learning_rate": 1.5875e-05,
      "loss": 1.2224,
      "step": 2947
    },
    {
      "epoch": 46.0625,
      "grad_norm": 2.800915479660034,
      "learning_rate": 1.5812499999999998e-05,
      "loss": 1.1045,
      "step": 2948
    },
    {
      "epoch": 46.078125,
      "grad_norm": 2.3445770740509033,
      "learning_rate": 1.575e-05,
      "loss": 1.1964,
      "step": 2949
    },
    {
      "epoch": 46.09375,
      "grad_norm": 2.0678164958953857,
      "learning_rate": 1.56875e-05,
      "loss": 1.1745,
      "step": 2950
    },
    {
      "epoch": 46.109375,
      "grad_norm": 2.2408058643341064,
      "learning_rate": 1.5625e-05,
      "loss": 1.0053,
      "step": 2951
    },
    {
      "epoch": 46.125,
      "grad_norm": 2.646674633026123,
      "learning_rate": 1.5562500000000002e-05,
      "loss": 1.2634,
      "step": 2952
    },
    {
      "epoch": 46.140625,
      "grad_norm": 2.2199440002441406,
      "learning_rate": 1.55e-05,
      "loss": 1.1366,
      "step": 2953
    },
    {
      "epoch": 46.15625,
      "grad_norm": 2.2758429050445557,
      "learning_rate": 1.5437500000000003e-05,
      "loss": 1.0995,
      "step": 2954
    },
    {
      "epoch": 46.171875,
      "grad_norm": 2.838995933532715,
      "learning_rate": 1.5375e-05,
      "loss": 1.068,
      "step": 2955
    },
    {
      "epoch": 46.1875,
      "grad_norm": 2.3922500610351562,
      "learning_rate": 1.5312500000000003e-05,
      "loss": 1.3248,
      "step": 2956
    },
    {
      "epoch": 46.203125,
      "grad_norm": 2.2002532482147217,
      "learning_rate": 1.525e-05,
      "loss": 1.4244,
      "step": 2957
    },
    {
      "epoch": 46.21875,
      "grad_norm": 2.4690866470336914,
      "learning_rate": 1.5187500000000002e-05,
      "loss": 0.9417,
      "step": 2958
    },
    {
      "epoch": 46.234375,
      "grad_norm": 2.432359218597412,
      "learning_rate": 1.5125e-05,
      "loss": 1.1724,
      "step": 2959
    },
    {
      "epoch": 46.25,
      "grad_norm": 2.431945562362671,
      "learning_rate": 1.5062500000000002e-05,
      "loss": 0.9345,
      "step": 2960
    },
    {
      "epoch": 46.265625,
      "grad_norm": 2.3707621097564697,
      "learning_rate": 1.5e-05,
      "loss": 1.3764,
      "step": 2961
    },
    {
      "epoch": 46.28125,
      "grad_norm": 2.4996750354766846,
      "learning_rate": 1.4937500000000002e-05,
      "loss": 1.0862,
      "step": 2962
    },
    {
      "epoch": 46.296875,
      "grad_norm": 2.4901134967803955,
      "learning_rate": 1.4875e-05,
      "loss": 1.0768,
      "step": 2963
    },
    {
      "epoch": 46.3125,
      "grad_norm": 2.2629201412200928,
      "learning_rate": 1.4812500000000001e-05,
      "loss": 1.314,
      "step": 2964
    },
    {
      "epoch": 46.328125,
      "grad_norm": 2.6836698055267334,
      "learning_rate": 1.475e-05,
      "loss": 1.0879,
      "step": 2965
    },
    {
      "epoch": 46.34375,
      "grad_norm": 2.4034698009490967,
      "learning_rate": 1.4687500000000001e-05,
      "loss": 1.2711,
      "step": 2966
    },
    {
      "epoch": 46.359375,
      "grad_norm": 2.5641472339630127,
      "learning_rate": 1.4625e-05,
      "loss": 1.0156,
      "step": 2967
    },
    {
      "epoch": 46.375,
      "grad_norm": 2.150449514389038,
      "learning_rate": 1.4562500000000002e-05,
      "loss": 1.1878,
      "step": 2968
    },
    {
      "epoch": 46.390625,
      "grad_norm": 2.05808687210083,
      "learning_rate": 1.45e-05,
      "loss": 1.2927,
      "step": 2969
    },
    {
      "epoch": 46.40625,
      "grad_norm": 2.209505319595337,
      "learning_rate": 1.44375e-05,
      "loss": 1.2352,
      "step": 2970
    },
    {
      "epoch": 46.421875,
      "grad_norm": 2.659234046936035,
      "learning_rate": 1.4374999999999999e-05,
      "loss": 1.3107,
      "step": 2971
    },
    {
      "epoch": 46.4375,
      "grad_norm": 2.730201005935669,
      "learning_rate": 1.43125e-05,
      "loss": 1.0941,
      "step": 2972
    },
    {
      "epoch": 46.453125,
      "grad_norm": 2.155754566192627,
      "learning_rate": 1.4249999999999999e-05,
      "loss": 1.2433,
      "step": 2973
    },
    {
      "epoch": 46.46875,
      "grad_norm": 2.6393847465515137,
      "learning_rate": 1.4187500000000001e-05,
      "loss": 1.2771,
      "step": 2974
    },
    {
      "epoch": 46.484375,
      "grad_norm": 2.248565196990967,
      "learning_rate": 1.4125e-05,
      "loss": 1.0615,
      "step": 2975
    },
    {
      "epoch": 46.5,
      "grad_norm": 2.2955880165100098,
      "learning_rate": 1.4062500000000001e-05,
      "loss": 1.0657,
      "step": 2976
    },
    {
      "epoch": 46.515625,
      "grad_norm": 2.5820469856262207,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 1.0967,
      "step": 2977
    },
    {
      "epoch": 46.53125,
      "grad_norm": 2.400912284851074,
      "learning_rate": 1.39375e-05,
      "loss": 1.1955,
      "step": 2978
    },
    {
      "epoch": 46.546875,
      "grad_norm": 2.37082839012146,
      "learning_rate": 1.3875000000000002e-05,
      "loss": 1.1661,
      "step": 2979
    },
    {
      "epoch": 46.5625,
      "grad_norm": 2.502331018447876,
      "learning_rate": 1.38125e-05,
      "loss": 1.2037,
      "step": 2980
    },
    {
      "epoch": 46.578125,
      "grad_norm": 2.7231993675231934,
      "learning_rate": 1.3750000000000002e-05,
      "loss": 1.1475,
      "step": 2981
    },
    {
      "epoch": 46.59375,
      "grad_norm": 2.567122220993042,
      "learning_rate": 1.36875e-05,
      "loss": 1.3067,
      "step": 2982
    },
    {
      "epoch": 46.609375,
      "grad_norm": 2.476853847503662,
      "learning_rate": 1.3625e-05,
      "loss": 1.0322,
      "step": 2983
    },
    {
      "epoch": 46.625,
      "grad_norm": 2.778193235397339,
      "learning_rate": 1.3562500000000001e-05,
      "loss": 1.1768,
      "step": 2984
    },
    {
      "epoch": 46.640625,
      "grad_norm": 2.446499824523926,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 1.1649,
      "step": 2985
    },
    {
      "epoch": 46.65625,
      "grad_norm": 2.655759811401367,
      "learning_rate": 1.34375e-05,
      "loss": 1.0947,
      "step": 2986
    },
    {
      "epoch": 46.671875,
      "grad_norm": 2.231990098953247,
      "learning_rate": 1.3375000000000002e-05,
      "loss": 1.0338,
      "step": 2987
    },
    {
      "epoch": 46.6875,
      "grad_norm": 2.369236946105957,
      "learning_rate": 1.33125e-05,
      "loss": 1.154,
      "step": 2988
    },
    {
      "epoch": 46.703125,
      "grad_norm": 2.5781495571136475,
      "learning_rate": 1.3250000000000002e-05,
      "loss": 1.2256,
      "step": 2989
    },
    {
      "epoch": 46.71875,
      "grad_norm": 2.284078598022461,
      "learning_rate": 1.31875e-05,
      "loss": 0.9952,
      "step": 2990
    },
    {
      "epoch": 46.734375,
      "grad_norm": 2.5265281200408936,
      "learning_rate": 1.3125e-05,
      "loss": 1.1604,
      "step": 2991
    },
    {
      "epoch": 46.75,
      "grad_norm": 2.182009696960449,
      "learning_rate": 1.3062499999999999e-05,
      "loss": 1.4133,
      "step": 2992
    },
    {
      "epoch": 46.765625,
      "grad_norm": 2.646064281463623,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.1,
      "step": 2993
    },
    {
      "epoch": 46.78125,
      "grad_norm": 2.641434907913208,
      "learning_rate": 1.29375e-05,
      "loss": 0.9377,
      "step": 2994
    },
    {
      "epoch": 46.796875,
      "grad_norm": 2.538027763366699,
      "learning_rate": 1.2875000000000001e-05,
      "loss": 1.3108,
      "step": 2995
    },
    {
      "epoch": 46.8125,
      "grad_norm": 2.3424928188323975,
      "learning_rate": 1.28125e-05,
      "loss": 0.9637,
      "step": 2996
    },
    {
      "epoch": 46.828125,
      "grad_norm": 2.6498677730560303,
      "learning_rate": 1.2750000000000002e-05,
      "loss": 1.1524,
      "step": 2997
    },
    {
      "epoch": 46.84375,
      "grad_norm": 2.2221639156341553,
      "learning_rate": 1.26875e-05,
      "loss": 1.1835,
      "step": 2998
    },
    {
      "epoch": 46.859375,
      "grad_norm": 2.4990484714508057,
      "learning_rate": 1.2625e-05,
      "loss": 1.0556,
      "step": 2999
    },
    {
      "epoch": 46.875,
      "grad_norm": 2.455411434173584,
      "learning_rate": 1.2562499999999999e-05,
      "loss": 1.1721,
      "step": 3000
    },
    {
      "epoch": 46.890625,
      "grad_norm": 2.4197587966918945,
      "learning_rate": 1.25e-05,
      "loss": 1.1188,
      "step": 3001
    },
    {
      "epoch": 46.90625,
      "grad_norm": 2.7952983379364014,
      "learning_rate": 1.24375e-05,
      "loss": 1.1527,
      "step": 3002
    },
    {
      "epoch": 46.921875,
      "grad_norm": 2.432068109512329,
      "learning_rate": 1.2375000000000001e-05,
      "loss": 1.0686,
      "step": 3003
    },
    {
      "epoch": 46.9375,
      "grad_norm": 2.5888843536376953,
      "learning_rate": 1.2312500000000001e-05,
      "loss": 1.4372,
      "step": 3004
    },
    {
      "epoch": 46.953125,
      "grad_norm": 2.321885108947754,
      "learning_rate": 1.225e-05,
      "loss": 1.1743,
      "step": 3005
    },
    {
      "epoch": 46.96875,
      "grad_norm": 2.6068828105926514,
      "learning_rate": 1.21875e-05,
      "loss": 1.1987,
      "step": 3006
    },
    {
      "epoch": 46.984375,
      "grad_norm": 2.496223211288452,
      "learning_rate": 1.2125e-05,
      "loss": 1.4832,
      "step": 3007
    },
    {
      "epoch": 47.0,
      "grad_norm": 2.423100233078003,
      "learning_rate": 1.20625e-05,
      "loss": 0.983,
      "step": 3008
    },
    {
      "epoch": 47.0,
      "eval_loss": 3.1055960655212402,
      "eval_runtime": 2.8906,
      "eval_samples_per_second": 177.126,
      "eval_steps_per_second": 44.282,
      "step": 3008
    },
    {
      "epoch": 47.015625,
      "grad_norm": 2.509795904159546,
      "learning_rate": 1.2e-05,
      "loss": 0.9541,
      "step": 3009
    },
    {
      "epoch": 47.03125,
      "grad_norm": 2.372809410095215,
      "learning_rate": 1.19375e-05,
      "loss": 1.3905,
      "step": 3010
    },
    {
      "epoch": 47.046875,
      "grad_norm": 2.4252126216888428,
      "learning_rate": 1.1875e-05,
      "loss": 1.3044,
      "step": 3011
    },
    {
      "epoch": 47.0625,
      "grad_norm": 2.390352964401245,
      "learning_rate": 1.1812499999999999e-05,
      "loss": 1.2471,
      "step": 3012
    },
    {
      "epoch": 47.078125,
      "grad_norm": 2.869349956512451,
      "learning_rate": 1.175e-05,
      "loss": 1.2907,
      "step": 3013
    },
    {
      "epoch": 47.09375,
      "grad_norm": 2.320693254470825,
      "learning_rate": 1.1687500000000001e-05,
      "loss": 1.084,
      "step": 3014
    },
    {
      "epoch": 47.109375,
      "grad_norm": 2.459099769592285,
      "learning_rate": 1.1625000000000001e-05,
      "loss": 1.2965,
      "step": 3015
    },
    {
      "epoch": 47.125,
      "grad_norm": 2.4093432426452637,
      "learning_rate": 1.1562500000000002e-05,
      "loss": 1.3164,
      "step": 3016
    },
    {
      "epoch": 47.140625,
      "grad_norm": 2.300201177597046,
      "learning_rate": 1.1500000000000002e-05,
      "loss": 1.2299,
      "step": 3017
    },
    {
      "epoch": 47.15625,
      "grad_norm": 2.644428014755249,
      "learning_rate": 1.14375e-05,
      "loss": 1.3138,
      "step": 3018
    },
    {
      "epoch": 47.171875,
      "grad_norm": 2.192413806915283,
      "learning_rate": 1.1375e-05,
      "loss": 1.3206,
      "step": 3019
    },
    {
      "epoch": 47.1875,
      "grad_norm": 2.1274116039276123,
      "learning_rate": 1.13125e-05,
      "loss": 0.9683,
      "step": 3020
    },
    {
      "epoch": 47.203125,
      "grad_norm": 2.6596081256866455,
      "learning_rate": 1.125e-05,
      "loss": 1.0842,
      "step": 3021
    },
    {
      "epoch": 47.21875,
      "grad_norm": 2.381910562515259,
      "learning_rate": 1.1187500000000001e-05,
      "loss": 1.0604,
      "step": 3022
    },
    {
      "epoch": 47.234375,
      "grad_norm": 2.2472829818725586,
      "learning_rate": 1.1125000000000001e-05,
      "loss": 1.1977,
      "step": 3023
    },
    {
      "epoch": 47.25,
      "grad_norm": 2.316030502319336,
      "learning_rate": 1.1062500000000001e-05,
      "loss": 1.1029,
      "step": 3024
    },
    {
      "epoch": 47.265625,
      "grad_norm": 2.5653951168060303,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.3483,
      "step": 3025
    },
    {
      "epoch": 47.28125,
      "grad_norm": 2.9239420890808105,
      "learning_rate": 1.09375e-05,
      "loss": 1.0561,
      "step": 3026
    },
    {
      "epoch": 47.296875,
      "grad_norm": 2.5205023288726807,
      "learning_rate": 1.0875e-05,
      "loss": 1.1223,
      "step": 3027
    },
    {
      "epoch": 47.3125,
      "grad_norm": 2.6007916927337646,
      "learning_rate": 1.08125e-05,
      "loss": 1.1878,
      "step": 3028
    },
    {
      "epoch": 47.328125,
      "grad_norm": 2.586651563644409,
      "learning_rate": 1.075e-05,
      "loss": 1.0812,
      "step": 3029
    },
    {
      "epoch": 47.34375,
      "grad_norm": 2.47239351272583,
      "learning_rate": 1.06875e-05,
      "loss": 1.0784,
      "step": 3030
    },
    {
      "epoch": 47.359375,
      "grad_norm": 2.3301188945770264,
      "learning_rate": 1.0625e-05,
      "loss": 1.2416,
      "step": 3031
    },
    {
      "epoch": 47.375,
      "grad_norm": 2.35758638381958,
      "learning_rate": 1.0562500000000001e-05,
      "loss": 1.1992,
      "step": 3032
    },
    {
      "epoch": 47.390625,
      "grad_norm": 2.3415229320526123,
      "learning_rate": 1.05e-05,
      "loss": 1.2667,
      "step": 3033
    },
    {
      "epoch": 47.40625,
      "grad_norm": 2.585875988006592,
      "learning_rate": 1.04375e-05,
      "loss": 1.2902,
      "step": 3034
    },
    {
      "epoch": 47.421875,
      "grad_norm": 2.3519504070281982,
      "learning_rate": 1.0375e-05,
      "loss": 1.0936,
      "step": 3035
    },
    {
      "epoch": 47.4375,
      "grad_norm": 2.466120719909668,
      "learning_rate": 1.03125e-05,
      "loss": 1.1966,
      "step": 3036
    },
    {
      "epoch": 47.453125,
      "grad_norm": 2.3116726875305176,
      "learning_rate": 1.025e-05,
      "loss": 1.1692,
      "step": 3037
    },
    {
      "epoch": 47.46875,
      "grad_norm": 2.576486825942993,
      "learning_rate": 1.01875e-05,
      "loss": 1.3147,
      "step": 3038
    },
    {
      "epoch": 47.484375,
      "grad_norm": 2.4766693115234375,
      "learning_rate": 1.0125e-05,
      "loss": 1.0947,
      "step": 3039
    },
    {
      "epoch": 47.5,
      "grad_norm": 2.406907796859741,
      "learning_rate": 1.00625e-05,
      "loss": 1.1266,
      "step": 3040
    },
    {
      "epoch": 47.515625,
      "grad_norm": 2.502495288848877,
      "learning_rate": 1e-05,
      "loss": 1.184,
      "step": 3041
    },
    {
      "epoch": 47.53125,
      "grad_norm": 2.4132320880889893,
      "learning_rate": 9.937500000000001e-06,
      "loss": 1.345,
      "step": 3042
    },
    {
      "epoch": 47.546875,
      "grad_norm": 2.5477330684661865,
      "learning_rate": 9.875000000000001e-06,
      "loss": 0.9524,
      "step": 3043
    },
    {
      "epoch": 47.5625,
      "grad_norm": 2.3703062534332275,
      "learning_rate": 9.812500000000001e-06,
      "loss": 1.1437,
      "step": 3044
    },
    {
      "epoch": 47.578125,
      "grad_norm": 2.364993095397949,
      "learning_rate": 9.750000000000002e-06,
      "loss": 1.2287,
      "step": 3045
    },
    {
      "epoch": 47.59375,
      "grad_norm": 2.4110267162323,
      "learning_rate": 9.6875e-06,
      "loss": 1.0911,
      "step": 3046
    },
    {
      "epoch": 47.609375,
      "grad_norm": 2.746173143386841,
      "learning_rate": 9.625e-06,
      "loss": 1.3643,
      "step": 3047
    },
    {
      "epoch": 47.625,
      "grad_norm": 2.206632614135742,
      "learning_rate": 9.5625e-06,
      "loss": 1.1816,
      "step": 3048
    },
    {
      "epoch": 47.640625,
      "grad_norm": 2.450880527496338,
      "learning_rate": 9.5e-06,
      "loss": 1.2326,
      "step": 3049
    },
    {
      "epoch": 47.65625,
      "grad_norm": 2.2306089401245117,
      "learning_rate": 9.4375e-06,
      "loss": 1.1382,
      "step": 3050
    },
    {
      "epoch": 47.671875,
      "grad_norm": 2.331016778945923,
      "learning_rate": 9.375000000000001e-06,
      "loss": 1.215,
      "step": 3051
    },
    {
      "epoch": 47.6875,
      "grad_norm": 2.3852171897888184,
      "learning_rate": 9.312500000000001e-06,
      "loss": 1.2454,
      "step": 3052
    },
    {
      "epoch": 47.703125,
      "grad_norm": 2.428661346435547,
      "learning_rate": 9.25e-06,
      "loss": 1.1296,
      "step": 3053
    },
    {
      "epoch": 47.71875,
      "grad_norm": 2.05684757232666,
      "learning_rate": 9.1875e-06,
      "loss": 0.9949,
      "step": 3054
    },
    {
      "epoch": 47.734375,
      "grad_norm": 2.4488658905029297,
      "learning_rate": 9.125e-06,
      "loss": 1.3259,
      "step": 3055
    },
    {
      "epoch": 47.75,
      "grad_norm": 2.295776844024658,
      "learning_rate": 9.0625e-06,
      "loss": 1.2978,
      "step": 3056
    },
    {
      "epoch": 47.765625,
      "grad_norm": 2.3064956665039062,
      "learning_rate": 9e-06,
      "loss": 1.1748,
      "step": 3057
    },
    {
      "epoch": 47.78125,
      "grad_norm": 2.31868052482605,
      "learning_rate": 8.9375e-06,
      "loss": 0.9693,
      "step": 3058
    },
    {
      "epoch": 47.796875,
      "grad_norm": 2.4005706310272217,
      "learning_rate": 8.875e-06,
      "loss": 0.8298,
      "step": 3059
    },
    {
      "epoch": 47.8125,
      "grad_norm": 2.576357841491699,
      "learning_rate": 8.8125e-06,
      "loss": 1.0801,
      "step": 3060
    },
    {
      "epoch": 47.828125,
      "grad_norm": 2.4679722785949707,
      "learning_rate": 8.75e-06,
      "loss": 1.2962,
      "step": 3061
    },
    {
      "epoch": 47.84375,
      "grad_norm": 2.356828451156616,
      "learning_rate": 8.6875e-06,
      "loss": 1.3422,
      "step": 3062
    },
    {
      "epoch": 47.859375,
      "grad_norm": 2.8082754611968994,
      "learning_rate": 8.625e-06,
      "loss": 0.9639,
      "step": 3063
    },
    {
      "epoch": 47.875,
      "grad_norm": 2.3986878395080566,
      "learning_rate": 8.562500000000001e-06,
      "loss": 1.0478,
      "step": 3064
    },
    {
      "epoch": 47.890625,
      "grad_norm": 2.5709598064422607,
      "learning_rate": 8.500000000000002e-06,
      "loss": 1.4837,
      "step": 3065
    },
    {
      "epoch": 47.90625,
      "grad_norm": 2.3125269412994385,
      "learning_rate": 8.437500000000002e-06,
      "loss": 1.0349,
      "step": 3066
    },
    {
      "epoch": 47.921875,
      "grad_norm": 2.3507580757141113,
      "learning_rate": 8.375e-06,
      "loss": 0.9892,
      "step": 3067
    },
    {
      "epoch": 47.9375,
      "grad_norm": 2.502917766571045,
      "learning_rate": 8.3125e-06,
      "loss": 1.1798,
      "step": 3068
    },
    {
      "epoch": 47.953125,
      "grad_norm": 2.8454196453094482,
      "learning_rate": 8.25e-06,
      "loss": 1.059,
      "step": 3069
    },
    {
      "epoch": 47.96875,
      "grad_norm": 2.230910301208496,
      "learning_rate": 8.1875e-06,
      "loss": 1.1632,
      "step": 3070
    },
    {
      "epoch": 47.984375,
      "grad_norm": 2.643282651901245,
      "learning_rate": 8.125000000000001e-06,
      "loss": 1.1329,
      "step": 3071
    },
    {
      "epoch": 48.0,
      "grad_norm": 2.8129491806030273,
      "learning_rate": 8.062500000000001e-06,
      "loss": 1.0808,
      "step": 3072
    },
    {
      "epoch": 48.0,
      "eval_loss": 3.1079158782958984,
      "eval_runtime": 2.8571,
      "eval_samples_per_second": 179.204,
      "eval_steps_per_second": 44.801,
      "step": 3072
    },
    {
      "epoch": 48.015625,
      "grad_norm": 2.3928282260894775,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.2861,
      "step": 3073
    },
    {
      "epoch": 48.03125,
      "grad_norm": 2.201160192489624,
      "learning_rate": 7.9375e-06,
      "loss": 1.0933,
      "step": 3074
    },
    {
      "epoch": 48.046875,
      "grad_norm": 2.257796287536621,
      "learning_rate": 7.875e-06,
      "loss": 1.2243,
      "step": 3075
    },
    {
      "epoch": 48.0625,
      "grad_norm": 2.452867031097412,
      "learning_rate": 7.8125e-06,
      "loss": 1.1892,
      "step": 3076
    },
    {
      "epoch": 48.078125,
      "grad_norm": 2.2708446979522705,
      "learning_rate": 7.75e-06,
      "loss": 1.0874,
      "step": 3077
    },
    {
      "epoch": 48.09375,
      "grad_norm": 2.4367597103118896,
      "learning_rate": 7.6875e-06,
      "loss": 1.1887,
      "step": 3078
    },
    {
      "epoch": 48.109375,
      "grad_norm": 2.156236410140991,
      "learning_rate": 7.625e-06,
      "loss": 1.1971,
      "step": 3079
    },
    {
      "epoch": 48.125,
      "grad_norm": 2.4845733642578125,
      "learning_rate": 7.5625e-06,
      "loss": 1.0989,
      "step": 3080
    },
    {
      "epoch": 48.140625,
      "grad_norm": 2.381406784057617,
      "learning_rate": 7.5e-06,
      "loss": 1.074,
      "step": 3081
    },
    {
      "epoch": 48.15625,
      "grad_norm": 2.59542179107666,
      "learning_rate": 7.4375e-06,
      "loss": 1.2461,
      "step": 3082
    },
    {
      "epoch": 48.171875,
      "grad_norm": 2.5944342613220215,
      "learning_rate": 7.375e-06,
      "loss": 1.1396,
      "step": 3083
    },
    {
      "epoch": 48.1875,
      "grad_norm": 2.6048905849456787,
      "learning_rate": 7.3125e-06,
      "loss": 1.008,
      "step": 3084
    },
    {
      "epoch": 48.203125,
      "grad_norm": 2.4482784271240234,
      "learning_rate": 7.25e-06,
      "loss": 1.0707,
      "step": 3085
    },
    {
      "epoch": 48.21875,
      "grad_norm": 2.325038194656372,
      "learning_rate": 7.187499999999999e-06,
      "loss": 1.1798,
      "step": 3086
    },
    {
      "epoch": 48.234375,
      "grad_norm": 2.631857395172119,
      "learning_rate": 7.1249999999999995e-06,
      "loss": 1.134,
      "step": 3087
    },
    {
      "epoch": 48.25,
      "grad_norm": 2.635930061340332,
      "learning_rate": 7.0625e-06,
      "loss": 1.447,
      "step": 3088
    },
    {
      "epoch": 48.265625,
      "grad_norm": 2.209765911102295,
      "learning_rate": 7.000000000000001e-06,
      "loss": 1.2854,
      "step": 3089
    },
    {
      "epoch": 48.28125,
      "grad_norm": 2.623439073562622,
      "learning_rate": 6.937500000000001e-06,
      "loss": 1.1727,
      "step": 3090
    },
    {
      "epoch": 48.296875,
      "grad_norm": 2.366400718688965,
      "learning_rate": 6.875000000000001e-06,
      "loss": 1.1337,
      "step": 3091
    },
    {
      "epoch": 48.3125,
      "grad_norm": 2.3957326412200928,
      "learning_rate": 6.8125e-06,
      "loss": 1.315,
      "step": 3092
    },
    {
      "epoch": 48.328125,
      "grad_norm": 2.21511173248291,
      "learning_rate": 6.750000000000001e-06,
      "loss": 1.1285,
      "step": 3093
    },
    {
      "epoch": 48.34375,
      "grad_norm": 2.5379178524017334,
      "learning_rate": 6.687500000000001e-06,
      "loss": 1.2043,
      "step": 3094
    },
    {
      "epoch": 48.359375,
      "grad_norm": 2.3843088150024414,
      "learning_rate": 6.625000000000001e-06,
      "loss": 1.1837,
      "step": 3095
    },
    {
      "epoch": 48.375,
      "grad_norm": 2.1893980503082275,
      "learning_rate": 6.5625e-06,
      "loss": 1.2465,
      "step": 3096
    },
    {
      "epoch": 48.390625,
      "grad_norm": 2.301934003829956,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 1.0199,
      "step": 3097
    },
    {
      "epoch": 48.40625,
      "grad_norm": 2.306544065475464,
      "learning_rate": 6.437500000000001e-06,
      "loss": 1.126,
      "step": 3098
    },
    {
      "epoch": 48.421875,
      "grad_norm": 2.347616672515869,
      "learning_rate": 6.375000000000001e-06,
      "loss": 1.2156,
      "step": 3099
    },
    {
      "epoch": 48.4375,
      "grad_norm": 2.7637836933135986,
      "learning_rate": 6.3125e-06,
      "loss": 1.1393,
      "step": 3100
    },
    {
      "epoch": 48.453125,
      "grad_norm": 2.382791757583618,
      "learning_rate": 6.25e-06,
      "loss": 1.3176,
      "step": 3101
    },
    {
      "epoch": 48.46875,
      "grad_norm": 2.572197675704956,
      "learning_rate": 6.1875000000000005e-06,
      "loss": 1.1461,
      "step": 3102
    },
    {
      "epoch": 48.484375,
      "grad_norm": 2.3344779014587402,
      "learning_rate": 6.125e-06,
      "loss": 1.1365,
      "step": 3103
    },
    {
      "epoch": 48.5,
      "grad_norm": 2.5489089488983154,
      "learning_rate": 6.0625e-06,
      "loss": 1.3057,
      "step": 3104
    },
    {
      "epoch": 48.515625,
      "grad_norm": 2.2533655166625977,
      "learning_rate": 6e-06,
      "loss": 1.0426,
      "step": 3105
    },
    {
      "epoch": 48.53125,
      "grad_norm": 2.497016191482544,
      "learning_rate": 5.9375e-06,
      "loss": 1.1019,
      "step": 3106
    },
    {
      "epoch": 48.546875,
      "grad_norm": 2.4689576625823975,
      "learning_rate": 5.875e-06,
      "loss": 1.1875,
      "step": 3107
    },
    {
      "epoch": 48.5625,
      "grad_norm": 2.578665256500244,
      "learning_rate": 5.812500000000001e-06,
      "loss": 1.1353,
      "step": 3108
    },
    {
      "epoch": 48.578125,
      "grad_norm": 2.2875282764434814,
      "learning_rate": 5.750000000000001e-06,
      "loss": 1.1819,
      "step": 3109
    },
    {
      "epoch": 48.59375,
      "grad_norm": 2.2324817180633545,
      "learning_rate": 5.6875e-06,
      "loss": 1.1556,
      "step": 3110
    },
    {
      "epoch": 48.609375,
      "grad_norm": 2.3836026191711426,
      "learning_rate": 5.625e-06,
      "loss": 1.1761,
      "step": 3111
    },
    {
      "epoch": 48.625,
      "grad_norm": 2.32076096534729,
      "learning_rate": 5.5625000000000005e-06,
      "loss": 1.1222,
      "step": 3112
    },
    {
      "epoch": 48.640625,
      "grad_norm": 2.593010663986206,
      "learning_rate": 5.500000000000001e-06,
      "loss": 1.1138,
      "step": 3113
    },
    {
      "epoch": 48.65625,
      "grad_norm": 2.319077491760254,
      "learning_rate": 5.4375e-06,
      "loss": 0.8703,
      "step": 3114
    },
    {
      "epoch": 48.671875,
      "grad_norm": 2.354836940765381,
      "learning_rate": 5.375e-06,
      "loss": 1.2917,
      "step": 3115
    },
    {
      "epoch": 48.6875,
      "grad_norm": 2.604189872741699,
      "learning_rate": 5.3125e-06,
      "loss": 1.1356,
      "step": 3116
    },
    {
      "epoch": 48.703125,
      "grad_norm": 2.5877809524536133,
      "learning_rate": 5.25e-06,
      "loss": 1.2548,
      "step": 3117
    },
    {
      "epoch": 48.71875,
      "grad_norm": 2.239002227783203,
      "learning_rate": 5.1875e-06,
      "loss": 1.131,
      "step": 3118
    },
    {
      "epoch": 48.734375,
      "grad_norm": 2.546199083328247,
      "learning_rate": 5.125e-06,
      "loss": 1.1148,
      "step": 3119
    },
    {
      "epoch": 48.75,
      "grad_norm": 2.6385042667388916,
      "learning_rate": 5.0625e-06,
      "loss": 1.205,
      "step": 3120
    },
    {
      "epoch": 48.765625,
      "grad_norm": 2.905940294265747,
      "learning_rate": 5e-06,
      "loss": 1.1026,
      "step": 3121
    },
    {
      "epoch": 48.78125,
      "grad_norm": 2.299159049987793,
      "learning_rate": 4.937500000000001e-06,
      "loss": 1.0924,
      "step": 3122
    },
    {
      "epoch": 48.796875,
      "grad_norm": 2.249237537384033,
      "learning_rate": 4.875000000000001e-06,
      "loss": 1.0105,
      "step": 3123
    },
    {
      "epoch": 48.8125,
      "grad_norm": 2.3360843658447266,
      "learning_rate": 4.8125e-06,
      "loss": 1.3671,
      "step": 3124
    },
    {
      "epoch": 48.828125,
      "grad_norm": 2.6462650299072266,
      "learning_rate": 4.75e-06,
      "loss": 0.9846,
      "step": 3125
    },
    {
      "epoch": 48.84375,
      "grad_norm": 2.864973783493042,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 1.1106,
      "step": 3126
    },
    {
      "epoch": 48.859375,
      "grad_norm": 2.471169948577881,
      "learning_rate": 4.625e-06,
      "loss": 1.3318,
      "step": 3127
    },
    {
      "epoch": 48.875,
      "grad_norm": 2.479504346847534,
      "learning_rate": 4.5625e-06,
      "loss": 1.3903,
      "step": 3128
    },
    {
      "epoch": 48.890625,
      "grad_norm": 2.9465384483337402,
      "learning_rate": 4.5e-06,
      "loss": 0.9955,
      "step": 3129
    },
    {
      "epoch": 48.90625,
      "grad_norm": 2.685180425643921,
      "learning_rate": 4.4375e-06,
      "loss": 1.1651,
      "step": 3130
    },
    {
      "epoch": 48.921875,
      "grad_norm": 2.635226249694824,
      "learning_rate": 4.375e-06,
      "loss": 0.9207,
      "step": 3131
    },
    {
      "epoch": 48.9375,
      "grad_norm": 2.229553699493408,
      "learning_rate": 4.3125e-06,
      "loss": 1.3849,
      "step": 3132
    },
    {
      "epoch": 48.953125,
      "grad_norm": 2.4026846885681152,
      "learning_rate": 4.250000000000001e-06,
      "loss": 1.3687,
      "step": 3133
    },
    {
      "epoch": 48.96875,
      "grad_norm": 2.3672003746032715,
      "learning_rate": 4.1875e-06,
      "loss": 1.0265,
      "step": 3134
    },
    {
      "epoch": 48.984375,
      "grad_norm": 2.3130927085876465,
      "learning_rate": 4.125e-06,
      "loss": 1.1621,
      "step": 3135
    },
    {
      "epoch": 49.0,
      "grad_norm": 2.7719027996063232,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 1.2928,
      "step": 3136
    },
    {
      "epoch": 49.0,
      "eval_loss": 3.110339879989624,
      "eval_runtime": 2.9085,
      "eval_samples_per_second": 176.035,
      "eval_steps_per_second": 44.009,
      "step": 3136
    },
    {
      "epoch": 49.015625,
      "grad_norm": 2.294367551803589,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.2487,
      "step": 3137
    },
    {
      "epoch": 49.03125,
      "grad_norm": 2.7908895015716553,
      "learning_rate": 3.9375e-06,
      "loss": 1.2677,
      "step": 3138
    },
    {
      "epoch": 49.046875,
      "grad_norm": 2.4802637100219727,
      "learning_rate": 3.875e-06,
      "loss": 1.0641,
      "step": 3139
    },
    {
      "epoch": 49.0625,
      "grad_norm": 2.6591179370880127,
      "learning_rate": 3.8125e-06,
      "loss": 1.066,
      "step": 3140
    },
    {
      "epoch": 49.078125,
      "grad_norm": 2.3746299743652344,
      "learning_rate": 3.75e-06,
      "loss": 0.9107,
      "step": 3141
    },
    {
      "epoch": 49.09375,
      "grad_norm": 2.238903045654297,
      "learning_rate": 3.6875e-06,
      "loss": 1.2896,
      "step": 3142
    },
    {
      "epoch": 49.109375,
      "grad_norm": 2.538064956665039,
      "learning_rate": 3.625e-06,
      "loss": 1.2444,
      "step": 3143
    },
    {
      "epoch": 49.125,
      "grad_norm": 2.4688735008239746,
      "learning_rate": 3.5624999999999998e-06,
      "loss": 1.1816,
      "step": 3144
    },
    {
      "epoch": 49.140625,
      "grad_norm": 2.2864370346069336,
      "learning_rate": 3.5000000000000004e-06,
      "loss": 1.031,
      "step": 3145
    },
    {
      "epoch": 49.15625,
      "grad_norm": 2.4640986919403076,
      "learning_rate": 3.4375000000000005e-06,
      "loss": 1.2056,
      "step": 3146
    },
    {
      "epoch": 49.171875,
      "grad_norm": 2.523707866668701,
      "learning_rate": 3.3750000000000003e-06,
      "loss": 1.213,
      "step": 3147
    },
    {
      "epoch": 49.1875,
      "grad_norm": 2.4467716217041016,
      "learning_rate": 3.3125000000000005e-06,
      "loss": 0.9684,
      "step": 3148
    },
    {
      "epoch": 49.203125,
      "grad_norm": 2.4916765689849854,
      "learning_rate": 3.2500000000000002e-06,
      "loss": 1.1932,
      "step": 3149
    },
    {
      "epoch": 49.21875,
      "grad_norm": 2.7916197776794434,
      "learning_rate": 3.1875000000000004e-06,
      "loss": 1.1623,
      "step": 3150
    },
    {
      "epoch": 49.234375,
      "grad_norm": 2.2963547706604004,
      "learning_rate": 3.125e-06,
      "loss": 1.0987,
      "step": 3151
    },
    {
      "epoch": 49.25,
      "grad_norm": 2.514925479888916,
      "learning_rate": 3.0625e-06,
      "loss": 1.1952,
      "step": 3152
    },
    {
      "epoch": 49.265625,
      "grad_norm": 2.124866008758545,
      "learning_rate": 3e-06,
      "loss": 1.2173,
      "step": 3153
    },
    {
      "epoch": 49.28125,
      "grad_norm": 2.7122440338134766,
      "learning_rate": 2.9375e-06,
      "loss": 1.1944,
      "step": 3154
    },
    {
      "epoch": 49.296875,
      "grad_norm": 2.4314825534820557,
      "learning_rate": 2.8750000000000004e-06,
      "loss": 1.2882,
      "step": 3155
    },
    {
      "epoch": 49.3125,
      "grad_norm": 2.3211724758148193,
      "learning_rate": 2.8125e-06,
      "loss": 1.106,
      "step": 3156
    },
    {
      "epoch": 49.328125,
      "grad_norm": 2.4640369415283203,
      "learning_rate": 2.7500000000000004e-06,
      "loss": 1.1214,
      "step": 3157
    },
    {
      "epoch": 49.34375,
      "grad_norm": 2.428194761276245,
      "learning_rate": 2.6875e-06,
      "loss": 1.28,
      "step": 3158
    },
    {
      "epoch": 49.359375,
      "grad_norm": 2.6056251525878906,
      "learning_rate": 2.625e-06,
      "loss": 1.0517,
      "step": 3159
    },
    {
      "epoch": 49.375,
      "grad_norm": 2.2244675159454346,
      "learning_rate": 2.5625e-06,
      "loss": 1.1124,
      "step": 3160
    },
    {
      "epoch": 49.390625,
      "grad_norm": 2.425952434539795,
      "learning_rate": 2.5e-06,
      "loss": 0.9356,
      "step": 3161
    },
    {
      "epoch": 49.40625,
      "grad_norm": 2.5440285205841064,
      "learning_rate": 2.4375000000000004e-06,
      "loss": 1.2232,
      "step": 3162
    },
    {
      "epoch": 49.421875,
      "grad_norm": 2.380816698074341,
      "learning_rate": 2.375e-06,
      "loss": 1.2925,
      "step": 3163
    },
    {
      "epoch": 49.4375,
      "grad_norm": 2.275545358657837,
      "learning_rate": 2.3125e-06,
      "loss": 0.9834,
      "step": 3164
    },
    {
      "epoch": 49.453125,
      "grad_norm": 2.219804048538208,
      "learning_rate": 2.25e-06,
      "loss": 0.9485,
      "step": 3165
    },
    {
      "epoch": 49.46875,
      "grad_norm": 2.5316874980926514,
      "learning_rate": 2.1875e-06,
      "loss": 1.1471,
      "step": 3166
    },
    {
      "epoch": 49.484375,
      "grad_norm": 2.155351161956787,
      "learning_rate": 2.1250000000000004e-06,
      "loss": 1.2879,
      "step": 3167
    },
    {
      "epoch": 49.5,
      "grad_norm": 2.385019540786743,
      "learning_rate": 2.0625e-06,
      "loss": 1.2963,
      "step": 3168
    },
    {
      "epoch": 49.515625,
      "grad_norm": 2.6091301441192627,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.2311,
      "step": 3169
    },
    {
      "epoch": 49.53125,
      "grad_norm": 2.1706490516662598,
      "learning_rate": 1.9375e-06,
      "loss": 1.0857,
      "step": 3170
    },
    {
      "epoch": 49.546875,
      "grad_norm": 2.1843278408050537,
      "learning_rate": 1.875e-06,
      "loss": 1.2141,
      "step": 3171
    },
    {
      "epoch": 49.5625,
      "grad_norm": 2.7409772872924805,
      "learning_rate": 1.8125e-06,
      "loss": 1.0974,
      "step": 3172
    },
    {
      "epoch": 49.578125,
      "grad_norm": 2.5963306427001953,
      "learning_rate": 1.7500000000000002e-06,
      "loss": 1.1038,
      "step": 3173
    },
    {
      "epoch": 49.59375,
      "grad_norm": 2.4064159393310547,
      "learning_rate": 1.6875000000000001e-06,
      "loss": 1.2412,
      "step": 3174
    },
    {
      "epoch": 49.609375,
      "grad_norm": 2.5437941551208496,
      "learning_rate": 1.6250000000000001e-06,
      "loss": 1.0848,
      "step": 3175
    },
    {
      "epoch": 49.625,
      "grad_norm": 2.3702032566070557,
      "learning_rate": 1.5625e-06,
      "loss": 1.3774,
      "step": 3176
    },
    {
      "epoch": 49.640625,
      "grad_norm": 2.2947046756744385,
      "learning_rate": 1.5e-06,
      "loss": 1.2063,
      "step": 3177
    },
    {
      "epoch": 49.65625,
      "grad_norm": 2.5193099975585938,
      "learning_rate": 1.4375000000000002e-06,
      "loss": 1.1917,
      "step": 3178
    },
    {
      "epoch": 49.671875,
      "grad_norm": 2.3890082836151123,
      "learning_rate": 1.3750000000000002e-06,
      "loss": 1.2157,
      "step": 3179
    },
    {
      "epoch": 49.6875,
      "grad_norm": 2.296818733215332,
      "learning_rate": 1.3125e-06,
      "loss": 1.2041,
      "step": 3180
    },
    {
      "epoch": 49.703125,
      "grad_norm": 2.4799201488494873,
      "learning_rate": 1.25e-06,
      "loss": 1.2832,
      "step": 3181
    },
    {
      "epoch": 49.71875,
      "grad_norm": 2.1099891662597656,
      "learning_rate": 1.1875e-06,
      "loss": 1.0985,
      "step": 3182
    },
    {
      "epoch": 49.734375,
      "grad_norm": 2.503634214401245,
      "learning_rate": 1.125e-06,
      "loss": 1.2214,
      "step": 3183
    },
    {
      "epoch": 49.75,
      "grad_norm": 2.2290198802948,
      "learning_rate": 1.0625000000000002e-06,
      "loss": 1.1211,
      "step": 3184
    },
    {
      "epoch": 49.765625,
      "grad_norm": 2.6939406394958496,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.012,
      "step": 3185
    },
    {
      "epoch": 49.78125,
      "grad_norm": 2.428685426712036,
      "learning_rate": 9.375e-07,
      "loss": 0.9611,
      "step": 3186
    },
    {
      "epoch": 49.796875,
      "grad_norm": 2.732412815093994,
      "learning_rate": 8.750000000000001e-07,
      "loss": 1.1851,
      "step": 3187
    },
    {
      "epoch": 49.8125,
      "grad_norm": 2.422429323196411,
      "learning_rate": 8.125000000000001e-07,
      "loss": 1.1099,
      "step": 3188
    },
    {
      "epoch": 49.828125,
      "grad_norm": 2.495234251022339,
      "learning_rate": 7.5e-07,
      "loss": 1.1421,
      "step": 3189
    },
    {
      "epoch": 49.84375,
      "grad_norm": 2.601391077041626,
      "learning_rate": 6.875000000000001e-07,
      "loss": 1.015,
      "step": 3190
    },
    {
      "epoch": 49.859375,
      "grad_norm": 2.4532949924468994,
      "learning_rate": 6.25e-07,
      "loss": 1.2108,
      "step": 3191
    },
    {
      "epoch": 49.875,
      "grad_norm": 2.8382925987243652,
      "learning_rate": 5.625e-07,
      "loss": 1.3154,
      "step": 3192
    },
    {
      "epoch": 49.890625,
      "grad_norm": 2.799362897872925,
      "learning_rate": 5.000000000000001e-07,
      "loss": 1.0809,
      "step": 3193
    },
    {
      "epoch": 49.90625,
      "grad_norm": 2.2004003524780273,
      "learning_rate": 4.3750000000000005e-07,
      "loss": 1.3217,
      "step": 3194
    },
    {
      "epoch": 49.921875,
      "grad_norm": 2.6756105422973633,
      "learning_rate": 3.75e-07,
      "loss": 1.2261,
      "step": 3195
    },
    {
      "epoch": 49.9375,
      "grad_norm": 2.369204044342041,
      "learning_rate": 3.125e-07,
      "loss": 1.0835,
      "step": 3196
    },
    {
      "epoch": 49.953125,
      "grad_norm": 2.518136501312256,
      "learning_rate": 2.5000000000000004e-07,
      "loss": 1.4901,
      "step": 3197
    },
    {
      "epoch": 49.96875,
      "grad_norm": 2.4189226627349854,
      "learning_rate": 1.875e-07,
      "loss": 0.9419,
      "step": 3198
    },
    {
      "epoch": 49.984375,
      "grad_norm": 2.501368999481201,
      "learning_rate": 1.2500000000000002e-07,
      "loss": 1.1135,
      "step": 3199
    },
    {
      "epoch": 50.0,
      "grad_norm": 3.3031222820281982,
      "learning_rate": 6.250000000000001e-08,
      "loss": 1.1255,
      "step": 3200
    },
    {
      "epoch": 50.0,
      "eval_loss": 3.1100478172302246,
      "eval_runtime": 2.8596,
      "eval_samples_per_second": 179.046,
      "eval_steps_per_second": 44.762,
      "step": 3200
    }
  ],
  "logging_steps": 1,
  "max_steps": 3200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6095882903552e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}