diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,19152 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9987642455032266,
+  "eval_steps": 500,
+  "global_step": 2730,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0010984484415762735,
+      "grad_norm": 0.13173329830169678,
+      "learning_rate": 1.0989010989010988e-06,
+      "loss": 0.8751,
+      "step": 1
+    },
+    {
+      "epoch": 0.002196896883152547,
+      "grad_norm": 0.19401921331882477,
+      "learning_rate": 2.1978021978021976e-06,
+      "loss": 1.3488,
+      "step": 2
+    },
+    {
+      "epoch": 0.0032953453247288205,
+      "grad_norm": 0.142131969332695,
+      "learning_rate": 3.2967032967032968e-06,
+      "loss": 0.8371,
+      "step": 3
+    },
+    {
+      "epoch": 0.004393793766305094,
+      "grad_norm": 0.1124999076128006,
+      "learning_rate": 4.395604395604395e-06,
+      "loss": 1.0039,
+      "step": 4
+    },
+    {
+      "epoch": 0.005492242207881368,
+      "grad_norm": 0.20683947205543518,
+      "learning_rate": 5.494505494505494e-06,
+      "loss": 1.4423,
+      "step": 5
+    },
+    {
+      "epoch": 0.006590690649457641,
+      "grad_norm": 0.2007640153169632,
+      "learning_rate": 6.5934065934065935e-06,
+      "loss": 0.9797,
+      "step": 6
+    },
+    {
+      "epoch": 0.007689139091033915,
+      "grad_norm": 0.1362670361995697,
+      "learning_rate": 7.692307692307692e-06,
+      "loss": 1.0443,
+      "step": 7
+    },
+    {
+      "epoch": 0.008787587532610188,
+      "grad_norm": 0.21512511372566223,
+      "learning_rate": 8.79120879120879e-06,
+      "loss": 1.2888,
+      "step": 8
+    },
+    {
+      "epoch": 0.009886035974186462,
+      "grad_norm": 0.13403186202049255,
+      "learning_rate": 9.89010989010989e-06,
+      "loss": 0.9637,
+      "step": 9
+    },
+    {
+      "epoch": 0.010984484415762736,
+      "grad_norm": 0.16911157965660095,
+      "learning_rate": 1.0989010989010989e-05,
+      "loss": 0.8824,
+      "step": 10
+    },
+    {
+      "epoch": 0.012082932857339008,
+      "grad_norm": 0.19280359148979187,
+      "learning_rate": 1.2087912087912087e-05,
+      "loss": 0.9843,
+      "step": 11
+    },
+    {
+      "epoch": 0.013181381298915282,
+      "grad_norm": 0.15720519423484802,
+      "learning_rate": 1.3186813186813187e-05,
+      "loss": 0.9769,
+      "step": 12
+    },
+    {
+      "epoch": 0.014279829740491556,
+      "grad_norm": 0.18622402846813202,
+      "learning_rate": 1.4285714285714284e-05,
+      "loss": 0.903,
+      "step": 13
+    },
+    {
+      "epoch": 0.01537827818206783,
+      "grad_norm": 0.1491895169019699,
+      "learning_rate": 1.5384615384615384e-05,
+      "loss": 1.065,
+      "step": 14
+    },
+    {
+      "epoch": 0.016476726623644102,
+      "grad_norm": 0.16883142292499542,
+      "learning_rate": 1.6483516483516482e-05,
+      "loss": 0.9916,
+      "step": 15
+    },
+    {
+      "epoch": 0.017575175065220376,
+      "grad_norm": 0.155453160405159,
+      "learning_rate": 1.758241758241758e-05,
+      "loss": 1.1048,
+      "step": 16
+    },
+    {
+      "epoch": 0.01867362350679665,
+      "grad_norm": 0.12869666516780853,
+      "learning_rate": 1.868131868131868e-05,
+      "loss": 0.9355,
+      "step": 17
+    },
+    {
+      "epoch": 0.019772071948372924,
+      "grad_norm": 0.18860433995723724,
+      "learning_rate": 1.978021978021978e-05,
+      "loss": 1.1779,
+      "step": 18
+    },
+    {
+      "epoch": 0.020870520389949198,
+      "grad_norm": 0.30738529562950134,
+      "learning_rate": 2.087912087912088e-05,
+      "loss": 0.905,
+      "step": 19
+    },
+    {
+      "epoch": 0.021968968831525472,
+      "grad_norm": 0.30248674750328064,
+      "learning_rate": 2.1978021978021977e-05,
+      "loss": 1.0749,
+      "step": 20
+    },
+    {
+      "epoch": 0.023067417273101742,
+      "grad_norm": 0.17005079984664917,
+      "learning_rate": 2.3076923076923076e-05,
+      "loss": 1.0141,
+      "step": 21
+    },
+    {
+      "epoch": 0.024165865714678016,
+      "grad_norm": 0.5497377514839172,
+      "learning_rate": 2.4175824175824174e-05,
+      "loss": 0.804,
+      "step": 22
+    },
+    {
+      "epoch": 0.02526431415625429,
+      "grad_norm": 0.23464925587177277,
+      "learning_rate": 2.5274725274725276e-05,
+      "loss": 1.0592,
+      "step": 23
+    },
+    {
+      "epoch": 0.026362762597830564,
+      "grad_norm": 0.2906591594219208,
+      "learning_rate": 2.6373626373626374e-05,
+      "loss": 1.4096,
+      "step": 24
+    },
+    {
+      "epoch": 0.027461211039406838,
+      "grad_norm": 0.14552968740463257,
+      "learning_rate": 2.747252747252747e-05,
+      "loss": 0.8827,
+      "step": 25
+    },
+    {
+      "epoch": 0.028559659480983112,
+      "grad_norm": 0.26139914989471436,
+      "learning_rate": 2.8571428571428567e-05,
+      "loss": 1.1081,
+      "step": 26
+    },
+    {
+      "epoch": 0.029658107922559386,
+      "grad_norm": 0.16122505068778992,
+      "learning_rate": 2.9670329670329666e-05,
+      "loss": 0.8967,
+      "step": 27
+    },
+    {
+      "epoch": 0.03075655636413566,
+      "grad_norm": 0.19174647331237793,
+      "learning_rate": 3.076923076923077e-05,
+      "loss": 0.7527,
+      "step": 28
+    },
+    {
+      "epoch": 0.031855004805711934,
+      "grad_norm": 0.24506032466888428,
+      "learning_rate": 3.1868131868131866e-05,
+      "loss": 1.0981,
+      "step": 29
+    },
+    {
+      "epoch": 0.032953453247288204,
+      "grad_norm": 0.18928349018096924,
+      "learning_rate": 3.2967032967032964e-05,
+      "loss": 1.2955,
+      "step": 30
+    },
+    {
+      "epoch": 0.03405190168886448,
+      "grad_norm": 0.20482106506824493,
+      "learning_rate": 3.406593406593406e-05,
+      "loss": 0.886,
+      "step": 31
+    },
+    {
+      "epoch": 0.03515035013044075,
+      "grad_norm": 0.17304010689258575,
+      "learning_rate": 3.516483516483516e-05,
+      "loss": 1.0062,
+      "step": 32
+    },
+    {
+      "epoch": 0.03624879857201702,
+      "grad_norm": 0.17006444931030273,
+      "learning_rate": 3.626373626373626e-05,
+      "loss": 0.76,
+      "step": 33
+    },
+    {
+      "epoch": 0.0373472470135933,
+      "grad_norm": 0.16570955514907837,
+      "learning_rate": 3.736263736263736e-05,
+      "loss": 0.7512,
+      "step": 34
+    },
+    {
+      "epoch": 0.03844569545516957,
+      "grad_norm": 0.4470347464084625,
+      "learning_rate": 3.8461538461538456e-05,
+      "loss": 1.051,
+      "step": 35
+    },
+    {
+      "epoch": 0.03954414389674585,
+      "grad_norm": 0.3013080060482025,
+      "learning_rate": 3.956043956043956e-05,
+      "loss": 1.1269,
+      "step": 36
+    },
+    {
+      "epoch": 0.04064259233832212,
+      "grad_norm": 0.33114469051361084,
+      "learning_rate": 4.065934065934065e-05,
+      "loss": 1.046,
+      "step": 37
+    },
+    {
+      "epoch": 0.041741040779898396,
+      "grad_norm": 0.3496829867362976,
+      "learning_rate": 4.175824175824176e-05,
+      "loss": 0.9139,
+      "step": 38
+    },
+    {
+      "epoch": 0.042839489221474666,
+      "grad_norm": 0.36173877120018005,
+      "learning_rate": 4.285714285714285e-05,
+      "loss": 1.16,
+      "step": 39
+    },
+    {
+      "epoch": 0.043937937663050944,
+      "grad_norm": 0.23047995567321777,
+      "learning_rate": 4.3956043956043955e-05,
+      "loss": 0.8623,
+      "step": 40
+    },
+    {
+      "epoch": 0.045036386104627214,
+      "grad_norm": 0.33733946084976196,
+      "learning_rate": 4.5054945054945046e-05,
+      "loss": 0.873,
+      "step": 41
+    },
+    {
+      "epoch": 0.046134834546203485,
+      "grad_norm": 0.43975624442100525,
+      "learning_rate": 4.615384615384615e-05,
+      "loss": 0.9374,
+      "step": 42
+    },
+    {
+      "epoch": 0.04723328298777976,
+      "grad_norm": 0.5429202318191528,
+      "learning_rate": 4.725274725274725e-05,
+      "loss": 1.0699,
+      "step": 43
+    },
+    {
+      "epoch": 0.04833173142935603,
+      "grad_norm": 0.39317595958709717,
+      "learning_rate": 4.835164835164835e-05,
+      "loss": 0.7719,
+      "step": 44
+    },
+    {
+      "epoch": 0.04943017987093231,
+      "grad_norm": 0.41328710317611694,
+      "learning_rate": 4.9450549450549446e-05,
+      "loss": 1.112,
+      "step": 45
+    },
+    {
+      "epoch": 0.05052862831250858,
+      "grad_norm": 0.5977774858474731,
+      "learning_rate": 5.054945054945055e-05,
+      "loss": 0.9408,
+      "step": 46
+    },
+    {
+      "epoch": 0.05162707675408486,
+      "grad_norm": 0.6984797716140747,
+      "learning_rate": 5.164835164835164e-05,
+      "loss": 0.9766,
+      "step": 47
+    },
+    {
+      "epoch": 0.05272552519566113,
+      "grad_norm": 0.5161548256874084,
+      "learning_rate": 5.274725274725275e-05,
+      "loss": 1.3705,
+      "step": 48
+    },
+    {
+      "epoch": 0.0538239736372374,
+      "grad_norm": 0.5750108361244202,
+      "learning_rate": 5.384615384615384e-05,
+      "loss": 0.9492,
+      "step": 49
+    },
+    {
+      "epoch": 0.054922422078813676,
+      "grad_norm": 0.7861920595169067,
+      "learning_rate": 5.494505494505494e-05,
+      "loss": 1.1495,
+      "step": 50
+    },
+    {
+      "epoch": 0.05602087052038995,
+      "grad_norm": 0.5992287993431091,
+      "learning_rate": 5.6043956043956037e-05,
+      "loss": 1.2818,
+      "step": 51
+    },
+    {
+      "epoch": 0.057119318961966224,
+      "grad_norm": 0.5470016598701477,
+      "learning_rate": 5.7142857142857135e-05,
+      "loss": 1.0385,
+      "step": 52
+    },
+    {
+      "epoch": 0.058217767403542495,
+      "grad_norm": 0.7035269141197205,
+      "learning_rate": 5.824175824175824e-05,
+      "loss": 0.785,
+      "step": 53
+    },
+    {
+      "epoch": 0.05931621584511877,
+      "grad_norm": 0.5253639817237854,
+      "learning_rate": 5.934065934065933e-05,
+      "loss": 0.6092,
+      "step": 54
+    },
+    {
+      "epoch": 0.06041466428669504,
+      "grad_norm": 0.5233064293861389,
+      "learning_rate": 6.043956043956044e-05,
+      "loss": 0.7853,
+      "step": 55
+    },
+    {
+      "epoch": 0.06151311272827132,
+      "grad_norm": 0.4508589804172516,
+      "learning_rate": 6.153846153846154e-05,
+      "loss": 0.5737,
+      "step": 56
+    },
+    {
+      "epoch": 0.06261156116984759,
+      "grad_norm": 1.0521594285964966,
+      "learning_rate": 6.263736263736263e-05,
+      "loss": 1.0132,
+      "step": 57
+    },
+    {
+      "epoch": 0.06371000961142387,
+      "grad_norm": 0.3572557866573334,
+      "learning_rate": 6.373626373626373e-05,
+      "loss": 0.655,
+      "step": 58
+    },
+    {
+      "epoch": 0.06480845805300013,
+      "grad_norm": 0.600371241569519,
+      "learning_rate": 6.483516483516483e-05,
+      "loss": 0.8897,
+      "step": 59
+    },
+    {
+      "epoch": 0.06590690649457641,
+      "grad_norm": 0.6430579423904419,
+      "learning_rate": 6.593406593406593e-05,
+      "loss": 0.8058,
+      "step": 60
+    },
+    {
+      "epoch": 0.06700535493615269,
+      "grad_norm": 0.5309410095214844,
+      "learning_rate": 6.703296703296703e-05,
+      "loss": 0.7312,
+      "step": 61
+    },
+    {
+      "epoch": 0.06810380337772896,
+      "grad_norm": 0.46225860714912415,
+      "learning_rate": 6.813186813186813e-05,
+      "loss": 0.8607,
+      "step": 62
+    },
+    {
+      "epoch": 0.06920225181930523,
+      "grad_norm": 0.8889493346214294,
+      "learning_rate": 6.923076923076922e-05,
+      "loss": 0.7791,
+      "step": 63
+    },
+    {
+      "epoch": 0.0703007002608815,
+      "grad_norm": 0.5721575617790222,
+      "learning_rate": 7.032967032967032e-05,
+      "loss": 0.9426,
+      "step": 64
+    },
+    {
+      "epoch": 0.07139914870245778,
+      "grad_norm": 0.8355056047439575,
+      "learning_rate": 7.142857142857142e-05,
+      "loss": 0.621,
+      "step": 65
+    },
+    {
+      "epoch": 0.07249759714403405,
+      "grad_norm": 1.3048707246780396,
+      "learning_rate": 7.252747252747252e-05,
+      "loss": 0.8869,
+      "step": 66
+    },
+    {
+      "epoch": 0.07359604558561032,
+      "grad_norm": 0.5817797183990479,
+      "learning_rate": 7.362637362637362e-05,
+      "loss": 0.8385,
+      "step": 67
+    },
+    {
+      "epoch": 0.0746944940271866,
+      "grad_norm": 1.2051454782485962,
+      "learning_rate": 7.472527472527472e-05,
+      "loss": 0.7566,
+      "step": 68
+    },
+    {
+      "epoch": 0.07579294246876288,
+      "grad_norm": 0.8565987944602966,
+      "learning_rate": 7.582417582417581e-05,
+      "loss": 0.8374,
+      "step": 69
+    },
+    {
+      "epoch": 0.07689139091033914,
+      "grad_norm": 0.7503894567489624,
+      "learning_rate": 7.692307692307691e-05,
+      "loss": 0.6749,
+      "step": 70
+    },
+    {
+      "epoch": 0.07798983935191542,
+      "grad_norm": 0.6298589706420898,
+      "learning_rate": 7.802197802197802e-05,
+      "loss": 0.9096,
+      "step": 71
+    },
+    {
+      "epoch": 0.0790882877934917,
+      "grad_norm": 0.8327789306640625,
+      "learning_rate": 7.912087912087912e-05,
+      "loss": 0.9836,
+      "step": 72
+    },
+    {
+      "epoch": 0.08018673623506796,
+      "grad_norm": 1.0001461505889893,
+      "learning_rate": 8.021978021978021e-05,
+      "loss": 0.6917,
+      "step": 73
+    },
+    {
+      "epoch": 0.08128518467664424,
+      "grad_norm": 0.8373435735702515,
+      "learning_rate": 8.13186813186813e-05,
+      "loss": 0.7703,
+      "step": 74
+    },
+    {
+      "epoch": 0.08238363311822051,
+      "grad_norm": 0.9785758256912231,
+      "learning_rate": 8.241758241758242e-05,
+      "loss": 0.8004,
+      "step": 75
+    },
+    {
+      "epoch": 0.08348208155979679,
+      "grad_norm": 0.8900540471076965,
+      "learning_rate": 8.351648351648352e-05,
+      "loss": 0.8238,
+      "step": 76
+    },
+    {
+      "epoch": 0.08458053000137306,
+      "grad_norm": 0.7411159873008728,
+      "learning_rate": 8.46153846153846e-05,
+      "loss": 1.0364,
+      "step": 77
+    },
+    {
+      "epoch": 0.08567897844294933,
+      "grad_norm": 0.4975040555000305,
+      "learning_rate": 8.57142857142857e-05,
+      "loss": 0.4814,
+      "step": 78
+    },
+    {
+      "epoch": 0.08677742688452561,
+      "grad_norm": 0.6698398590087891,
+      "learning_rate": 8.681318681318681e-05,
+      "loss": 0.6828,
+      "step": 79
+    },
+    {
+      "epoch": 0.08787587532610189,
+      "grad_norm": 0.5883696675300598,
+      "learning_rate": 8.791208791208791e-05,
+      "loss": 0.92,
+      "step": 80
+    },
+    {
+      "epoch": 0.08897432376767815,
+      "grad_norm": 0.9050906896591187,
+      "learning_rate": 8.901098901098901e-05,
+      "loss": 0.7229,
+      "step": 81
+    },
+    {
+      "epoch": 0.09007277220925443,
+      "grad_norm": 0.5996706485748291,
+      "learning_rate": 9.010989010989009e-05,
+      "loss": 0.699,
+      "step": 82
+    },
+    {
+      "epoch": 0.0911712206508307,
+      "grad_norm": 2.0782630443573,
+      "learning_rate": 9.120879120879119e-05,
+      "loss": 1.2118,
+      "step": 83
+    },
+    {
+      "epoch": 0.09226966909240697,
+      "grad_norm": 0.759730875492096,
+      "learning_rate": 9.23076923076923e-05,
+      "loss": 0.6397,
+      "step": 84
+    },
+    {
+      "epoch": 0.09336811753398325,
+      "grad_norm": 1.1138097047805786,
+      "learning_rate": 9.34065934065934e-05,
+      "loss": 0.8973,
+      "step": 85
+    },
+    {
+      "epoch": 0.09446656597555952,
+      "grad_norm": 0.9852680563926697,
+      "learning_rate": 9.45054945054945e-05,
+      "loss": 1.0733,
+      "step": 86
+    },
+    {
+      "epoch": 0.0955650144171358,
+      "grad_norm": 0.8435002565383911,
+      "learning_rate": 9.560439560439558e-05,
+      "loss": 0.8977,
+      "step": 87
+    },
+    {
+      "epoch": 0.09666346285871207,
+      "grad_norm": 1.3031998872756958,
+      "learning_rate": 9.67032967032967e-05,
+      "loss": 0.9852,
+      "step": 88
+    },
+    {
+      "epoch": 0.09776191130028834,
+      "grad_norm": 0.6343463063240051,
+      "learning_rate": 9.78021978021978e-05,
+      "loss": 0.6147,
+      "step": 89
+    },
+    {
+      "epoch": 0.09886035974186462,
+      "grad_norm": 0.7061794996261597,
+      "learning_rate": 9.890109890109889e-05,
+      "loss": 0.7437,
+      "step": 90
+    },
+    {
+      "epoch": 0.09995880818344088,
+      "grad_norm": 1.2231422662734985,
+      "learning_rate": 9.999999999999999e-05,
+      "loss": 0.7944,
+      "step": 91
+    },
+    {
+      "epoch": 0.10105725662501716,
+      "grad_norm": 0.7199704647064209,
+      "learning_rate": 0.0001010989010989011,
+      "loss": 0.7355,
+      "step": 92
+    },
+    {
+      "epoch": 0.10215570506659344,
+      "grad_norm": 1.2740516662597656,
+      "learning_rate": 0.00010219780219780219,
+      "loss": 0.7622,
+      "step": 93
+    },
+    {
+      "epoch": 0.10325415350816972,
+      "grad_norm": 0.7762659788131714,
+      "learning_rate": 0.00010329670329670329,
+      "loss": 0.7074,
+      "step": 94
+    },
+    {
+      "epoch": 0.10435260194974598,
+      "grad_norm": 0.6618936061859131,
+      "learning_rate": 0.00010439560439560438,
+      "loss": 0.7667,
+      "step": 95
+    },
+    {
+      "epoch": 0.10545105039132226,
+      "grad_norm": 0.7244533896446228,
+      "learning_rate": 0.0001054945054945055,
+      "loss": 0.6451,
+      "step": 96
+    },
+    {
+      "epoch": 0.10654949883289853,
+      "grad_norm": 0.6391953229904175,
+      "learning_rate": 0.0001065934065934066,
+      "loss": 0.5637,
+      "step": 97
+    },
+    {
+      "epoch": 0.1076479472744748,
+      "grad_norm": 0.6992442607879639,
+      "learning_rate": 0.00010769230769230768,
+      "loss": 0.7112,
+      "step": 98
+    },
+    {
+      "epoch": 0.10874639571605108,
+      "grad_norm": 1.0820791721343994,
+      "learning_rate": 0.00010879120879120878,
+      "loss": 0.9199,
+      "step": 99
+    },
+    {
+      "epoch": 0.10984484415762735,
+      "grad_norm": 0.6012185215950012,
+      "learning_rate": 0.00010989010989010988,
+      "loss": 0.5574,
+      "step": 100
+    },
+    {
+      "epoch": 0.11094329259920363,
+      "grad_norm": 0.822455644607544,
+      "learning_rate": 0.00011098901098901099,
+      "loss": 0.5185,
+      "step": 101
+    },
+    {
+      "epoch": 0.1120417410407799,
+      "grad_norm": 0.9417555332183838,
+      "learning_rate": 0.00011208791208791207,
+      "loss": 0.6883,
+      "step": 102
+    },
+    {
+      "epoch": 0.11314018948235617,
+      "grad_norm": 1.0258208513259888,
+      "learning_rate": 0.00011318681318681317,
+      "loss": 0.7588,
+      "step": 103
+    },
+    {
+      "epoch": 0.11423863792393245,
+      "grad_norm": 1.904179573059082,
+      "learning_rate": 0.00011428571428571427,
+      "loss": 0.7425,
+      "step": 104
+    },
+    {
+      "epoch": 0.11533708636550873,
+      "grad_norm": 1.5453238487243652,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 0.658,
+      "step": 105
+    },
+    {
+      "epoch": 0.11643553480708499,
+      "grad_norm": 0.8801619410514832,
+      "learning_rate": 0.00011648351648351648,
+      "loss": 0.8432,
+      "step": 106
+    },
+    {
+      "epoch": 0.11753398324866127,
+      "grad_norm": 0.8567579388618469,
+      "learning_rate": 0.00011758241758241756,
+      "loss": 0.5904,
+      "step": 107
+    },
+    {
+      "epoch": 0.11863243169023754,
+      "grad_norm": 0.9351131319999695,
+      "learning_rate": 0.00011868131868131866,
+      "loss": 0.7228,
+      "step": 108
+    },
+    {
+      "epoch": 0.11973088013181381,
+      "grad_norm": 0.8817545175552368,
+      "learning_rate": 0.00011978021978021978,
+      "loss": 0.7853,
+      "step": 109
+    },
+    {
+      "epoch": 0.12082932857339009,
+      "grad_norm": 1.0484094619750977,
+      "learning_rate": 0.00012087912087912087,
+      "loss": 0.7049,
+      "step": 110
+    },
+    {
+      "epoch": 0.12192777701496636,
+      "grad_norm": 1.80658757686615,
+      "learning_rate": 0.00012197802197802197,
+      "loss": 0.669,
+      "step": 111
+    },
+    {
+      "epoch": 0.12302622545654264,
+      "grad_norm": 1.5311473608016968,
+      "learning_rate": 0.00012307692307692307,
+      "loss": 0.8342,
+      "step": 112
+    },
+    {
+      "epoch": 0.1241246738981189,
+      "grad_norm": 0.8968105912208557,
+      "learning_rate": 0.00012417582417582416,
+      "loss": 0.7199,
+      "step": 113
+    },
+    {
+      "epoch": 0.12522312233969518,
+      "grad_norm": 0.6149659156799316,
+      "learning_rate": 0.00012527472527472527,
+      "loss": 0.4961,
+      "step": 114
+    },
+    {
+      "epoch": 0.12632157078127146,
+      "grad_norm": 8.04592227935791,
+      "learning_rate": 0.00012637362637362635,
+      "loss": 0.7515,
+      "step": 115
+    },
+    {
+      "epoch": 0.12742001922284774,
+      "grad_norm": 0.7797659039497375,
+      "learning_rate": 0.00012747252747252746,
+      "loss": 0.7281,
+      "step": 116
+    },
+    {
+      "epoch": 0.128518467664424,
+      "grad_norm": 0.6414046883583069,
+      "learning_rate": 0.00012857142857142855,
+      "loss": 0.6655,
+      "step": 117
+    },
+    {
+      "epoch": 0.12961691610600026,
+      "grad_norm": 4.678529262542725,
+      "learning_rate": 0.00012967032967032966,
+      "loss": 0.9165,
+      "step": 118
+    },
+    {
+      "epoch": 0.13071536454757654,
+      "grad_norm": 0.8540724515914917,
+      "learning_rate": 0.00013076923076923077,
+      "loss": 0.7064,
+      "step": 119
+    },
+    {
+      "epoch": 0.13181381298915282,
+      "grad_norm": 1.057844638824463,
+      "learning_rate": 0.00013186813186813186,
+      "loss": 0.6617,
+      "step": 120
+    },
+    {
+      "epoch": 0.1329122614307291,
+      "grad_norm": 0.8429140448570251,
+      "learning_rate": 0.00013296703296703294,
+      "loss": 0.8156,
+      "step": 121
+    },
+    {
+      "epoch": 0.13401070987230537,
+      "grad_norm": 0.9944230914115906,
+      "learning_rate": 0.00013406593406593405,
+      "loss": 0.5851,
+      "step": 122
+    },
+    {
+      "epoch": 0.13510915831388165,
+      "grad_norm": 0.6582810878753662,
+      "learning_rate": 0.00013516483516483517,
+      "loss": 0.5819,
+      "step": 123
+    },
+    {
+      "epoch": 0.13620760675545793,
+      "grad_norm": 1.3106951713562012,
+      "learning_rate": 0.00013626373626373625,
+      "loss": 0.7598,
+      "step": 124
+    },
+    {
+      "epoch": 0.13730605519703418,
+      "grad_norm": 1.0464080572128296,
+      "learning_rate": 0.00013736263736263734,
+      "loss": 0.7241,
+      "step": 125
+    },
+    {
+      "epoch": 0.13840450363861045,
+      "grad_norm": 0.8519262075424194,
+      "learning_rate": 0.00013846153846153845,
+      "loss": 0.7001,
+      "step": 126
+    },
+    {
+      "epoch": 0.13950295208018673,
+      "grad_norm": 1.2764228582382202,
+      "learning_rate": 0.00013956043956043956,
+      "loss": 0.7152,
+      "step": 127
+    },
+    {
+      "epoch": 0.140601400521763,
+      "grad_norm": 1.157472014427185,
+      "learning_rate": 0.00014065934065934064,
+      "loss": 0.697,
+      "step": 128
+    },
+    {
+      "epoch": 0.1416998489633393,
+      "grad_norm": 0.7153847813606262,
+      "learning_rate": 0.00014175824175824173,
+      "loss": 0.6897,
+      "step": 129
+    },
+    {
+      "epoch": 0.14279829740491556,
+      "grad_norm": 0.7254152297973633,
+      "learning_rate": 0.00014285714285714284,
+      "loss": 0.5263,
+      "step": 130
+    },
+    {
+      "epoch": 0.14389674584649184,
+      "grad_norm": 1.3370522260665894,
+      "learning_rate": 0.00014395604395604395,
+      "loss": 0.7587,
+      "step": 131
+    },
+    {
+      "epoch": 0.1449951942880681,
+      "grad_norm": 1.092029333114624,
+      "learning_rate": 0.00014505494505494504,
+      "loss": 0.8674,
+      "step": 132
+    },
+    {
+      "epoch": 0.14609364272964437,
+      "grad_norm": 0.6123655438423157,
+      "learning_rate": 0.00014615384615384615,
+      "loss": 0.7163,
+      "step": 133
+    },
+    {
+      "epoch": 0.14719209117122065,
+      "grad_norm": 0.8476639986038208,
+      "learning_rate": 0.00014725274725274723,
+      "loss": 0.7241,
+      "step": 134
+    },
+    {
+      "epoch": 0.14829053961279692,
+      "grad_norm": 0.9986979961395264,
+      "learning_rate": 0.00014835164835164835,
+      "loss": 0.6229,
+      "step": 135
+    },
+    {
+      "epoch": 0.1493889880543732,
+      "grad_norm": 0.8208728432655334,
+      "learning_rate": 0.00014945054945054943,
+      "loss": 0.5441,
+      "step": 136
+    },
+    {
+      "epoch": 0.15048743649594948,
+      "grad_norm": 0.742091953754425,
+      "learning_rate": 0.00015054945054945054,
+      "loss": 0.6047,
+      "step": 137
+    },
+    {
+      "epoch": 0.15158588493752576,
+      "grad_norm": 1.6566306352615356,
+      "learning_rate": 0.00015164835164835163,
+      "loss": 0.6381,
+      "step": 138
+    },
+    {
+      "epoch": 0.152684333379102,
+      "grad_norm": 0.7735741138458252,
+      "learning_rate": 0.0001527472527472527,
+      "loss": 0.5842,
+      "step": 139
+    },
+    {
+      "epoch": 0.15378278182067828,
+      "grad_norm": 0.7116795778274536,
+      "learning_rate": 0.00015384615384615382,
+      "loss": 0.7117,
+      "step": 140
+    },
+    {
+      "epoch": 0.15488123026225456,
+      "grad_norm": 0.6912885904312134,
+      "learning_rate": 0.00015494505494505494,
+      "loss": 0.763,
+      "step": 141
+    },
+    {
+      "epoch": 0.15597967870383084,
+      "grad_norm": 1.0789505243301392,
+      "learning_rate": 0.00015604395604395605,
+      "loss": 0.5534,
+      "step": 142
+    },
+    {
+      "epoch": 0.15707812714540711,
+      "grad_norm": 1.0304033756256104,
+      "learning_rate": 0.00015714285714285713,
+      "loss": 0.4961,
+      "step": 143
+    },
+    {
+      "epoch": 0.1581765755869834,
+      "grad_norm": 1.0216940641403198,
+      "learning_rate": 0.00015824175824175824,
+      "loss": 0.8167,
+      "step": 144
+    },
+    {
+      "epoch": 0.15927502402855967,
+      "grad_norm": 0.7767283916473389,
+      "learning_rate": 0.00015934065934065933,
+      "loss": 0.649,
+      "step": 145
+    },
+    {
+      "epoch": 0.16037347247013592,
+      "grad_norm": 0.6125204563140869,
+      "learning_rate": 0.00016043956043956041,
+      "loss": 0.6596,
+      "step": 146
+    },
+    {
+      "epoch": 0.1614719209117122,
+      "grad_norm": 2.113314390182495,
+      "learning_rate": 0.00016153846153846153,
+      "loss": 0.6825,
+      "step": 147
+    },
+    {
+      "epoch": 0.16257036935328847,
+      "grad_norm": 1.3892889022827148,
+      "learning_rate": 0.0001626373626373626,
+      "loss": 0.5162,
+      "step": 148
+    },
+    {
+      "epoch": 0.16366881779486475,
+      "grad_norm": 1.2544710636138916,
+      "learning_rate": 0.0001637362637362637,
+      "loss": 0.5992,
+      "step": 149
+    },
+    {
+      "epoch": 0.16476726623644103,
+      "grad_norm": 1.2952786684036255,
+      "learning_rate": 0.00016483516483516484,
+      "loss": 0.5968,
+      "step": 150
+    },
+    {
+      "epoch": 0.1658657146780173,
+      "grad_norm": 0.9910382628440857,
+      "learning_rate": 0.00016593406593406592,
+      "loss": 0.6138,
+      "step": 151
+    },
+    {
+      "epoch": 0.16696416311959358,
+      "grad_norm": 0.7291635870933533,
+      "learning_rate": 0.00016703296703296703,
+      "loss": 0.8957,
+      "step": 152
+    },
+    {
+      "epoch": 0.16806261156116986,
+      "grad_norm": 0.7290105819702148,
+      "learning_rate": 0.00016813186813186812,
+      "loss": 0.4864,
+      "step": 153
+    },
+    {
+      "epoch": 0.1691610600027461,
+      "grad_norm": 1.1888444423675537,
+      "learning_rate": 0.0001692307692307692,
+      "loss": 0.913,
+      "step": 154
+    },
+    {
+      "epoch": 0.1702595084443224,
+      "grad_norm": 0.8183659315109253,
+      "learning_rate": 0.0001703296703296703,
+      "loss": 0.6405,
+      "step": 155
+    },
+    {
+      "epoch": 0.17135795688589867,
+      "grad_norm": 0.8549530506134033,
+      "learning_rate": 0.0001714285714285714,
+      "loss": 0.7019,
+      "step": 156
+    },
+    {
+      "epoch": 0.17245640532747494,
+      "grad_norm": 0.5960697531700134,
+      "learning_rate": 0.0001725274725274725,
+      "loss": 0.6728,
+      "step": 157
+    },
+    {
+      "epoch": 0.17355485376905122,
+      "grad_norm": 0.6802973747253418,
+      "learning_rate": 0.00017362637362637362,
+      "loss": 0.6462,
+      "step": 158
+    },
+    {
+      "epoch": 0.1746533022106275,
+      "grad_norm": 0.5056049823760986,
+      "learning_rate": 0.00017472527472527473,
+      "loss": 0.5155,
+      "step": 159
+    },
+    {
+      "epoch": 0.17575175065220378,
+      "grad_norm": 0.8181887865066528,
+      "learning_rate": 0.00017582417582417582,
+      "loss": 0.6631,
+      "step": 160
+    },
+    {
+      "epoch": 0.17685019909378003,
+      "grad_norm": 0.5748574137687683,
+      "learning_rate": 0.0001769230769230769,
+      "loss": 0.5807,
+      "step": 161
+    },
+    {
+      "epoch": 0.1779486475353563,
+      "grad_norm": 0.8585043549537659,
+      "learning_rate": 0.00017802197802197802,
+      "loss": 0.5412,
+      "step": 162
+    },
+    {
+      "epoch": 0.17904709597693258,
+      "grad_norm": 0.8763203620910645,
+      "learning_rate": 0.0001791208791208791,
+      "loss": 1.0859,
+      "step": 163
+    },
+    {
+      "epoch": 0.18014554441850886,
+      "grad_norm": 0.7327267527580261,
+      "learning_rate": 0.00018021978021978018,
+      "loss": 0.8034,
+      "step": 164
+    },
+    {
+      "epoch": 0.18124399286008513,
+      "grad_norm": 0.6813991665840149,
+      "learning_rate": 0.0001813186813186813,
+      "loss": 0.9236,
+      "step": 165
+    },
+    {
+      "epoch": 0.1823424413016614,
+      "grad_norm": 2.9234185218811035,
+      "learning_rate": 0.00018241758241758238,
+      "loss": 0.9148,
+      "step": 166
+    },
+    {
+      "epoch": 0.1834408897432377,
+      "grad_norm": 0.8117207884788513,
+      "learning_rate": 0.00018351648351648352,
+      "loss": 1.0514,
+      "step": 167
+    },
+    {
+      "epoch": 0.18453933818481394,
+      "grad_norm": 0.6485300064086914,
+      "learning_rate": 0.0001846153846153846,
+      "loss": 0.4764,
+      "step": 168
+    },
+    {
+      "epoch": 0.18563778662639022,
+      "grad_norm": 0.43059054017066956,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 0.6289,
+      "step": 169
+    },
+    {
+      "epoch": 0.1867362350679665,
+      "grad_norm": 1.007095456123352,
+      "learning_rate": 0.0001868131868131868,
+      "loss": 0.5889,
+      "step": 170
+    },
+    {
+      "epoch": 0.18783468350954277,
+      "grad_norm": 1.6733218431472778,
+      "learning_rate": 0.0001879120879120879,
+      "loss": 0.8036,
+      "step": 171
+    },
+    {
+      "epoch": 0.18893313195111905,
+      "grad_norm": 0.7533760666847229,
+      "learning_rate": 0.000189010989010989,
+      "loss": 0.7282,
+      "step": 172
+    },
+    {
+      "epoch": 0.19003158039269533,
+      "grad_norm": 0.45892444252967834,
+      "learning_rate": 0.00019010989010989008,
+      "loss": 0.6273,
+      "step": 173
+    },
+    {
+      "epoch": 0.1911300288342716,
+      "grad_norm": 0.54690021276474,
+      "learning_rate": 0.00019120879120879117,
+      "loss": 0.669,
+      "step": 174
+    },
+    {
+      "epoch": 0.19222847727584785,
+      "grad_norm": 0.7361836433410645,
+      "learning_rate": 0.0001923076923076923,
+      "loss": 0.8945,
+      "step": 175
+    },
+    {
+      "epoch": 0.19332692571742413,
+      "grad_norm": 0.5876324772834778,
+      "learning_rate": 0.0001934065934065934,
+      "loss": 0.7557,
+      "step": 176
+    },
+    {
+      "epoch": 0.1944253741590004,
+      "grad_norm": 0.7753897309303284,
+      "learning_rate": 0.0001945054945054945,
+      "loss": 0.7904,
+      "step": 177
+    },
+    {
+      "epoch": 0.19552382260057669,
+      "grad_norm": 0.6244968771934509,
+      "learning_rate": 0.0001956043956043956,
+      "loss": 0.7617,
+      "step": 178
+    },
+    {
+      "epoch": 0.19662227104215296,
+      "grad_norm": 0.6300948262214661,
+      "learning_rate": 0.00019670329670329667,
+      "loss": 0.5884,
+      "step": 179
+    },
+    {
+      "epoch": 0.19772071948372924,
+      "grad_norm": 0.5845354795455933,
+      "learning_rate": 0.00019780219780219779,
+      "loss": 0.8034,
+      "step": 180
+    },
+    {
+      "epoch": 0.19881916792530552,
+      "grad_norm": 0.5231277942657471,
+      "learning_rate": 0.00019890109890109887,
+      "loss": 0.5302,
+      "step": 181
+    },
+    {
+      "epoch": 0.19991761636688177,
+      "grad_norm": 0.8393481969833374,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 0.6376,
+      "step": 182
+    },
+    {
+      "epoch": 0.20101606480845804,
+      "grad_norm": 0.5777038335800171,
+      "learning_rate": 0.00020109890109890107,
+      "loss": 0.5777,
+      "step": 183
+    },
+    {
+      "epoch": 0.20211451325003432,
+      "grad_norm": 0.7751956582069397,
+      "learning_rate": 0.0002021978021978022,
+      "loss": 0.8368,
+      "step": 184
+    },
+    {
+      "epoch": 0.2032129616916106,
+      "grad_norm": 1.5582187175750732,
+      "learning_rate": 0.0002032967032967033,
+      "loss": 0.5087,
+      "step": 185
+    },
+    {
+      "epoch": 0.20431141013318688,
+      "grad_norm": 0.8304231762886047,
+      "learning_rate": 0.00020439560439560438,
+      "loss": 0.5512,
+      "step": 186
+    },
+    {
+      "epoch": 0.20540985857476315,
+      "grad_norm": 0.8545000553131104,
+      "learning_rate": 0.0002054945054945055,
+      "loss": 1.2533,
+      "step": 187
+    },
+    {
+      "epoch": 0.20650830701633943,
+      "grad_norm": 0.4891647696495056,
+      "learning_rate": 0.00020659340659340657,
+      "loss": 0.5738,
+      "step": 188
+    },
+    {
+      "epoch": 0.20760675545791568,
+      "grad_norm": 0.7159665822982788,
+      "learning_rate": 0.00020769230769230766,
+      "loss": 0.9266,
+      "step": 189
+    },
+    {
+      "epoch": 0.20870520389949196,
+      "grad_norm": 0.5053237080574036,
+      "learning_rate": 0.00020879120879120877,
+      "loss": 0.4574,
+      "step": 190
+    },
+    {
+      "epoch": 0.20980365234106824,
+      "grad_norm": 0.728336751461029,
+      "learning_rate": 0.00020989010989010985,
+      "loss": 0.6871,
+      "step": 191
+    },
+    {
+      "epoch": 0.2109021007826445,
+      "grad_norm": 0.8593311309814453,
+      "learning_rate": 0.000210989010989011,
+      "loss": 0.6788,
+      "step": 192
+    },
+    {
+      "epoch": 0.2120005492242208,
+      "grad_norm": 1.247111201286316,
+      "learning_rate": 0.00021208791208791208,
+      "loss": 0.5428,
+      "step": 193
+    },
+    {
+      "epoch": 0.21309899766579707,
+      "grad_norm": 0.6636946201324463,
+      "learning_rate": 0.0002131868131868132,
+      "loss": 0.7935,
+      "step": 194
+    },
+    {
+      "epoch": 0.21419744610737335,
+      "grad_norm": 0.5811622738838196,
+      "learning_rate": 0.00021428571428571427,
+      "loss": 0.4322,
+      "step": 195
+    },
+    {
+      "epoch": 0.2152958945489496,
+      "grad_norm": 0.5329126715660095,
+      "learning_rate": 0.00021538461538461536,
+      "loss": 0.7037,
+      "step": 196
+    },
+    {
+      "epoch": 0.21639434299052587,
+      "grad_norm": 1.730969786643982,
+      "learning_rate": 0.00021648351648351647,
+      "loss": 1.0315,
+      "step": 197
+    },
+    {
+      "epoch": 0.21749279143210215,
+      "grad_norm": 0.5242175459861755,
+      "learning_rate": 0.00021758241758241756,
+      "loss": 0.9285,
+      "step": 198
+    },
+    {
+      "epoch": 0.21859123987367843,
+      "grad_norm": 0.4745596945285797,
+      "learning_rate": 0.00021868131868131864,
+      "loss": 0.5414,
+      "step": 199
+    },
+    {
+      "epoch": 0.2196896883152547,
+      "grad_norm": 0.8693228363990784,
+      "learning_rate": 0.00021978021978021975,
+      "loss": 0.4576,
+      "step": 200
+    },
+    {
+      "epoch": 0.22078813675683098,
+      "grad_norm": 0.7073357105255127,
+      "learning_rate": 0.00022087912087912086,
+      "loss": 0.778,
+      "step": 201
+    },
+    {
+      "epoch": 0.22188658519840726,
+      "grad_norm": 0.535009503364563,
+      "learning_rate": 0.00022197802197802198,
+      "loss": 0.7734,
+      "step": 202
+    },
+    {
+      "epoch": 0.2229850336399835,
+      "grad_norm": 0.5862578749656677,
+      "learning_rate": 0.00022307692307692306,
+      "loss": 0.8612,
+      "step": 203
+    },
+    {
+      "epoch": 0.2240834820815598,
+      "grad_norm": 0.5167233943939209,
+      "learning_rate": 0.00022417582417582415,
+      "loss": 0.6122,
+      "step": 204
+    },
+    {
+      "epoch": 0.22518193052313606,
+      "grad_norm": 0.8982027769088745,
+      "learning_rate": 0.00022527472527472526,
+      "loss": 0.8905,
+      "step": 205
+    },
+    {
+      "epoch": 0.22628037896471234,
+      "grad_norm": 0.7311340570449829,
+      "learning_rate": 0.00022637362637362634,
+      "loss": 1.0151,
+      "step": 206
+    },
+    {
+      "epoch": 0.22737882740628862,
+      "grad_norm": 0.45674124360084534,
+      "learning_rate": 0.00022747252747252745,
+      "loss": 0.7056,
+      "step": 207
+    },
+    {
+      "epoch": 0.2284772758478649,
+      "grad_norm": 0.6916844844818115,
+      "learning_rate": 0.00022857142857142854,
+      "loss": 0.5977,
+      "step": 208
+    },
+    {
+      "epoch": 0.22957572428944117,
+      "grad_norm": 0.6632958650588989,
+      "learning_rate": 0.00022967032967032962,
+      "loss": 0.8228,
+      "step": 209
+    },
+    {
+      "epoch": 0.23067417273101745,
+      "grad_norm": 0.3243491053581238,
+      "learning_rate": 0.00023076923076923076,
+      "loss": 0.4823,
+      "step": 210
+    },
+    {
+      "epoch": 0.2317726211725937,
+      "grad_norm": 0.45630499720573425,
+      "learning_rate": 0.00023186813186813185,
+      "loss": 0.7206,
+      "step": 211
+    },
+    {
+      "epoch": 0.23287106961416998,
+      "grad_norm": 0.6726184487342834,
+      "learning_rate": 0.00023296703296703296,
+      "loss": 0.8211,
+      "step": 212
+    },
+    {
+      "epoch": 0.23396951805574626,
+      "grad_norm": 0.45092982053756714,
+      "learning_rate": 0.00023406593406593405,
+      "loss": 0.6812,
+      "step": 213
+    },
+    {
+      "epoch": 0.23506796649732253,
+      "grad_norm": 0.5624651312828064,
+      "learning_rate": 0.00023516483516483513,
+      "loss": 0.726,
+      "step": 214
+    },
+    {
+      "epoch": 0.2361664149388988,
+      "grad_norm": 1.1685765981674194,
+      "learning_rate": 0.00023626373626373624,
+      "loss": 0.7906,
+      "step": 215
+    },
+    {
+      "epoch": 0.2372648633804751,
+      "grad_norm": 0.581599771976471,
+      "learning_rate": 0.00023736263736263733,
+      "loss": 0.7049,
+      "step": 216
+    },
+    {
+      "epoch": 0.23836331182205137,
+      "grad_norm": 0.7660847902297974,
+      "learning_rate": 0.00023846153846153844,
+      "loss": 0.6105,
+      "step": 217
+    },
+    {
+      "epoch": 0.23946176026362762,
+      "grad_norm": 0.5126472115516663,
+      "learning_rate": 0.00023956043956043955,
+      "loss": 0.7134,
+      "step": 218
+    },
+    {
+      "epoch": 0.2405602087052039,
+      "grad_norm": 0.48460498452186584,
+      "learning_rate": 0.00024065934065934066,
+      "loss": 0.5578,
+      "step": 219
+    },
+    {
+      "epoch": 0.24165865714678017,
+      "grad_norm": 0.41463029384613037,
+      "learning_rate": 0.00024175824175824175,
+      "loss": 0.5589,
+      "step": 220
+    },
+    {
+      "epoch": 0.24275710558835645,
+      "grad_norm": 2.0703623294830322,
+      "learning_rate": 0.00024285714285714283,
+      "loss": 0.7128,
+      "step": 221
+    },
+    {
+      "epoch": 0.24385555402993273,
+      "grad_norm": 1.5641820430755615,
+      "learning_rate": 0.00024395604395604394,
+      "loss": 0.4439,
+      "step": 222
+    },
+    {
+      "epoch": 0.244954002471509,
+      "grad_norm": 0.34634652733802795,
+      "learning_rate": 0.00024505494505494503,
+      "loss": 0.5389,
+      "step": 223
+    },
+    {
+      "epoch": 0.24605245091308528,
+      "grad_norm": 0.5669183135032654,
+      "learning_rate": 0.00024615384615384614,
+      "loss": 0.5699,
+      "step": 224
+    },
+    {
+      "epoch": 0.24715089935466153,
+      "grad_norm": 0.6459633111953735,
+      "learning_rate": 0.0002472527472527472,
+      "loss": 0.7904,
+      "step": 225
+    },
+    {
+      "epoch": 0.2482493477962378,
+      "grad_norm": 0.9719502925872803,
+      "learning_rate": 0.0002483516483516483,
+      "loss": 0.7354,
+      "step": 226
+    },
+    {
+      "epoch": 0.24934779623781408,
+      "grad_norm": 0.7433357834815979,
+      "learning_rate": 0.0002494505494505494,
+      "loss": 0.5772,
+      "step": 227
+    },
+    {
+      "epoch": 0.25044624467939036,
+      "grad_norm": 0.42272481322288513,
+      "learning_rate": 0.00025054945054945053,
+      "loss": 0.5609,
+      "step": 228
+    },
+    {
+      "epoch": 0.2515446931209666,
+      "grad_norm": 1.2868828773498535,
+      "learning_rate": 0.00025164835164835165,
+      "loss": 0.5775,
+      "step": 229
+    },
+    {
+      "epoch": 0.2526431415625429,
+      "grad_norm": 0.40398430824279785,
+      "learning_rate": 0.0002527472527472527,
+      "loss": 0.742,
+      "step": 230
+    },
+    {
+      "epoch": 0.25374159000411917,
+      "grad_norm": 0.46501678228378296,
+      "learning_rate": 0.0002538461538461538,
+      "loss": 0.69,
+      "step": 231
+    },
+    {
+      "epoch": 0.25484003844569547,
+      "grad_norm": 0.46631869673728943,
+      "learning_rate": 0.00025494505494505493,
+      "loss": 0.7712,
+      "step": 232
+    },
+    {
+      "epoch": 0.2559384868872717,
+      "grad_norm": 0.6761367321014404,
+      "learning_rate": 0.000256043956043956,
+      "loss": 0.64,
+      "step": 233
+    },
+    {
+      "epoch": 0.257036935328848,
+      "grad_norm": 0.6253519654273987,
+      "learning_rate": 0.0002571428571428571,
+      "loss": 0.5499,
+      "step": 234
+    },
+    {
+      "epoch": 0.2581353837704243,
+      "grad_norm": 1.0556268692016602,
+      "learning_rate": 0.0002582417582417582,
+      "loss": 0.869,
+      "step": 235
+    },
+    {
+      "epoch": 0.2592338322120005,
+      "grad_norm": 0.4816044867038727,
+      "learning_rate": 0.0002593406593406593,
+      "loss": 0.6061,
+      "step": 236
+    },
+    {
+      "epoch": 0.26033228065357683,
+      "grad_norm": 1.1049383878707886,
+      "learning_rate": 0.00026043956043956043,
+      "loss": 0.7695,
+      "step": 237
+    },
+    {
+      "epoch": 0.2614307290951531,
+      "grad_norm": 0.44643181562423706,
+      "learning_rate": 0.00026153846153846154,
+      "loss": 0.7849,
+      "step": 238
+    },
+    {
+      "epoch": 0.2625291775367294,
+      "grad_norm": 0.5231640338897705,
+      "learning_rate": 0.0002626373626373626,
+      "loss": 0.8033,
+      "step": 239
+    },
+    {
+      "epoch": 0.26362762597830564,
+      "grad_norm": 0.5537316799163818,
+      "learning_rate": 0.0002637362637362637,
+      "loss": 0.7317,
+      "step": 240
+    },
+    {
+      "epoch": 0.26472607441988194,
+      "grad_norm": 0.42069998383522034,
+      "learning_rate": 0.0002648351648351648,
+      "loss": 0.6325,
+      "step": 241
+    },
+    {
+      "epoch": 0.2658245228614582,
+      "grad_norm": 0.8009732365608215,
+      "learning_rate": 0.0002659340659340659,
+      "loss": 0.6589,
+      "step": 242
+    },
+    {
+      "epoch": 0.26692297130303444,
+      "grad_norm": 1.2626444101333618,
+      "learning_rate": 0.000267032967032967,
+      "loss": 0.5845,
+      "step": 243
+    },
+    {
+      "epoch": 0.26802141974461074,
+      "grad_norm": 0.4783913195133209,
+      "learning_rate": 0.0002681318681318681,
+      "loss": 0.8844,
+      "step": 244
+    },
+    {
+      "epoch": 0.269119868186187,
+      "grad_norm": 1.098160982131958,
+      "learning_rate": 0.0002692307692307692,
+      "loss": 0.6134,
+      "step": 245
+    },
+    {
+      "epoch": 0.2702183166277633,
+      "grad_norm": 1.0397273302078247,
+      "learning_rate": 0.00027032967032967033,
+      "loss": 0.7861,
+      "step": 246
+    },
+    {
+      "epoch": 0.27131676506933955,
+      "grad_norm": 0.9729229807853699,
+      "learning_rate": 0.0002714285714285714,
+      "loss": 0.7691,
+      "step": 247
+    },
+    {
+      "epoch": 0.27241521351091585,
+      "grad_norm": 0.44837963581085205,
+      "learning_rate": 0.0002725274725274725,
+      "loss": 0.9414,
+      "step": 248
+    },
+    {
+      "epoch": 0.2735136619524921,
+      "grad_norm": 1.4863499402999878,
+      "learning_rate": 0.0002736263736263736,
+      "loss": 0.5825,
+      "step": 249
+    },
+    {
+      "epoch": 0.27461211039406835,
+      "grad_norm": 0.5948237180709839,
+      "learning_rate": 0.00027472527472527467,
+      "loss": 0.4934,
+      "step": 250
+    },
+    {
+      "epoch": 0.27571055883564466,
+      "grad_norm": 0.5448721051216125,
+      "learning_rate": 0.0002758241758241758,
+      "loss": 0.6295,
+      "step": 251
+    },
+    {
+      "epoch": 0.2768090072772209,
+      "grad_norm": 0.4309394657611847,
+      "learning_rate": 0.0002769230769230769,
+      "loss": 0.6561,
+      "step": 252
+    },
+    {
+      "epoch": 0.2779074557187972,
+      "grad_norm": 0.7659335136413574,
+      "learning_rate": 0.000278021978021978,
+      "loss": 0.7588,
+      "step": 253
+    },
+    {
+      "epoch": 0.27900590416037346,
+      "grad_norm": 0.45655715465545654,
+      "learning_rate": 0.0002791208791208791,
+      "loss": 0.5257,
+      "step": 254
+    },
+    {
+      "epoch": 0.28010435260194977,
+      "grad_norm": 0.5390630960464478,
+      "learning_rate": 0.0002802197802197802,
+      "loss": 0.7051,
+      "step": 255
+    },
+    {
+      "epoch": 0.281202801043526,
+      "grad_norm": 0.39703306555747986,
+      "learning_rate": 0.0002813186813186813,
+      "loss": 0.6137,
+      "step": 256
+    },
+    {
+      "epoch": 0.28230124948510227,
+      "grad_norm": 0.4662924110889435,
+      "learning_rate": 0.0002824175824175824,
+      "loss": 0.4897,
+      "step": 257
+    },
+    {
+      "epoch": 0.2833996979266786,
+      "grad_norm": 0.39399877190589905,
+      "learning_rate": 0.00028351648351648346,
+      "loss": 0.6235,
+      "step": 258
+    },
+    {
+      "epoch": 0.2844981463682548,
+      "grad_norm": 0.497549444437027,
+      "learning_rate": 0.00028461538461538457,
+      "loss": 0.5134,
+      "step": 259
+    },
+    {
+      "epoch": 0.28559659480983113,
+      "grad_norm": 0.6597803235054016,
+      "learning_rate": 0.0002857142857142857,
+      "loss": 0.7955,
+      "step": 260
+    },
+    {
+      "epoch": 0.2866950432514074,
+      "grad_norm": 0.5545711517333984,
+      "learning_rate": 0.0002868131868131868,
+      "loss": 0.833,
+      "step": 261
+    },
+    {
+      "epoch": 0.2877934916929837,
+      "grad_norm": 1.0227786302566528,
+      "learning_rate": 0.0002879120879120879,
+      "loss": 0.5249,
+      "step": 262
+    },
+    {
+      "epoch": 0.28889194013455993,
+      "grad_norm": 0.5727143883705139,
+      "learning_rate": 0.000289010989010989,
+      "loss": 0.6319,
+      "step": 263
+    },
+    {
+      "epoch": 0.2899903885761362,
+      "grad_norm": 0.39322397112846375,
+      "learning_rate": 0.0002901098901098901,
+      "loss": 0.7003,
+      "step": 264
+    },
+    {
+      "epoch": 0.2910888370177125,
+      "grad_norm": 0.5657737851142883,
+      "learning_rate": 0.0002912087912087912,
+      "loss": 0.7085,
+      "step": 265
+    },
+    {
+      "epoch": 0.29218728545928874,
+      "grad_norm": 0.4305976927280426,
+      "learning_rate": 0.0002923076923076923,
+      "loss": 0.5931,
+      "step": 266
+    },
+    {
+      "epoch": 0.29328573390086504,
+      "grad_norm": 0.5300284624099731,
+      "learning_rate": 0.00029340659340659336,
+      "loss": 0.7881,
+      "step": 267
+    },
+    {
+      "epoch": 0.2943841823424413,
+      "grad_norm": 0.5922349095344543,
+      "learning_rate": 0.00029450549450549447,
+      "loss": 0.8688,
+      "step": 268
+    },
+    {
+      "epoch": 0.2954826307840176,
+      "grad_norm": 0.5700828433036804,
+      "learning_rate": 0.0002956043956043956,
+      "loss": 1.1328,
+      "step": 269
+    },
+    {
+      "epoch": 0.29658107922559385,
+      "grad_norm": 0.6773694753646851,
+      "learning_rate": 0.0002967032967032967,
+      "loss": 0.7821,
+      "step": 270
+    },
+    {
+      "epoch": 0.2976795276671701,
+      "grad_norm": 0.5200739502906799,
+      "learning_rate": 0.0002978021978021978,
+      "loss": 0.8775,
+      "step": 271
+    },
+    {
+      "epoch": 0.2987779761087464,
+      "grad_norm": 0.9860020875930786,
+      "learning_rate": 0.00029890109890109886,
+      "loss": 0.9141,
+      "step": 272
+    },
+    {
+      "epoch": 0.29987642455032265,
+      "grad_norm": 0.7012956142425537,
+      "learning_rate": 0.0003,
+      "loss": 0.7672,
+      "step": 273
+    },
+    {
+      "epoch": 0.30097487299189896,
+      "grad_norm": 0.4128098785877228,
+      "learning_rate": 0.0002998778998778999,
+      "loss": 0.3969,
+      "step": 274
+    },
+    {
+      "epoch": 0.3020733214334752,
+      "grad_norm": 0.366597980260849,
+      "learning_rate": 0.00029975579975579974,
+      "loss": 0.639,
+      "step": 275
+    },
+    {
+      "epoch": 0.3031717698750515,
+      "grad_norm": 0.5208033919334412,
+      "learning_rate": 0.0002996336996336996,
+      "loss": 0.664,
+      "step": 276
+    },
+    {
+      "epoch": 0.30427021831662776,
+      "grad_norm": 0.45519202947616577,
+      "learning_rate": 0.0002995115995115995,
+      "loss": 0.8495,
+      "step": 277
+    },
+    {
+      "epoch": 0.305368666758204,
+      "grad_norm": 0.6617010831832886,
+      "learning_rate": 0.0002993894993894994,
+      "loss": 1.0204,
+      "step": 278
+    },
+    {
+      "epoch": 0.3064671151997803,
+      "grad_norm": 1.4151723384857178,
+      "learning_rate": 0.00029926739926739923,
+      "loss": 0.8289,
+      "step": 279
+    },
+    {
+      "epoch": 0.30756556364135657,
+      "grad_norm": 0.6531035900115967,
+      "learning_rate": 0.00029914529914529915,
+      "loss": 0.7571,
+      "step": 280
+    },
+    {
+      "epoch": 0.30866401208293287,
+      "grad_norm": 0.8595600724220276,
+      "learning_rate": 0.000299023199023199,
+      "loss": 0.9668,
+      "step": 281
+    },
+    {
+      "epoch": 0.3097624605245091,
+      "grad_norm": 0.50210040807724,
+      "learning_rate": 0.00029890109890109886,
+      "loss": 0.6662,
+      "step": 282
+    },
+    {
+      "epoch": 0.3108609089660854,
+      "grad_norm": 0.6004669666290283,
+      "learning_rate": 0.0002987789987789988,
+      "loss": 0.7127,
+      "step": 283
+    },
+    {
+      "epoch": 0.3119593574076617,
+      "grad_norm": 0.8085057139396667,
+      "learning_rate": 0.00029865689865689863,
+      "loss": 0.9266,
+      "step": 284
+    },
+    {
+      "epoch": 0.3130578058492379,
+      "grad_norm": 0.44965627789497375,
+      "learning_rate": 0.0002985347985347985,
+      "loss": 0.7118,
+      "step": 285
+    },
+    {
+      "epoch": 0.31415625429081423,
+      "grad_norm": 0.5758265852928162,
+      "learning_rate": 0.00029841269841269835,
+      "loss": 0.6915,
+      "step": 286
+    },
+    {
+      "epoch": 0.3152547027323905,
+      "grad_norm": 0.5623393058776855,
+      "learning_rate": 0.00029829059829059826,
+      "loss": 0.6962,
+      "step": 287
+    },
+    {
+      "epoch": 0.3163531511739668,
+      "grad_norm": 0.857796311378479,
+      "learning_rate": 0.0002981684981684982,
+      "loss": 0.676,
+      "step": 288
+    },
+    {
+      "epoch": 0.31745159961554303,
+      "grad_norm": 0.36431241035461426,
+      "learning_rate": 0.000298046398046398,
+      "loss": 0.5475,
+      "step": 289
+    },
+    {
+      "epoch": 0.31855004805711934,
+      "grad_norm": 0.4778802692890167,
+      "learning_rate": 0.0002979242979242979,
+      "loss": 0.7198,
+      "step": 290
+    },
+    {
+      "epoch": 0.3196484964986956,
+      "grad_norm": 0.4887610673904419,
+      "learning_rate": 0.0002978021978021978,
+      "loss": 0.5559,
+      "step": 291
+    },
+    {
+      "epoch": 0.32074694494027184,
+      "grad_norm": 0.745379626750946,
+      "learning_rate": 0.00029768009768009766,
+      "loss": 1.0509,
+      "step": 292
+    },
+    {
+      "epoch": 0.32184539338184814,
+      "grad_norm": 0.40081167221069336,
+      "learning_rate": 0.0002975579975579975,
+      "loss": 0.6564,
+      "step": 293
+    },
+    {
+      "epoch": 0.3229438418234244,
+      "grad_norm": 0.5133034586906433,
+      "learning_rate": 0.00029743589743589743,
+      "loss": 0.6765,
+      "step": 294
+    },
+    {
+      "epoch": 0.3240422902650007,
+      "grad_norm": 0.5123881697654724,
+      "learning_rate": 0.0002973137973137973,
+      "loss": 0.8001,
+      "step": 295
+    },
+    {
+      "epoch": 0.32514073870657695,
+      "grad_norm": 0.3771597743034363,
+      "learning_rate": 0.00029719169719169715,
+      "loss": 0.785,
+      "step": 296
+    },
+    {
+      "epoch": 0.32623918714815325,
+      "grad_norm": 0.38929086923599243,
+      "learning_rate": 0.00029706959706959706,
+      "loss": 0.7273,
+      "step": 297
+    },
+    {
+      "epoch": 0.3273376355897295,
+      "grad_norm": 0.47761446237564087,
+      "learning_rate": 0.0002969474969474969,
+      "loss": 0.6997,
+      "step": 298
+    },
+    {
+      "epoch": 0.3284360840313058,
+      "grad_norm": 0.4798452854156494,
+      "learning_rate": 0.0002968253968253968,
+      "loss": 0.7171,
+      "step": 299
+    },
+    {
+      "epoch": 0.32953453247288206,
+      "grad_norm": 0.5864073038101196,
+      "learning_rate": 0.0002967032967032967,
+      "loss": 0.7075,
+      "step": 300
+    },
+    {
+      "epoch": 0.3306329809144583,
+      "grad_norm": 0.6298258900642395,
+      "learning_rate": 0.00029658119658119655,
+      "loss": 0.8659,
+      "step": 301
+    },
+    {
+      "epoch": 0.3317314293560346,
+      "grad_norm": 0.9764651656150818,
+      "learning_rate": 0.0002964590964590964,
+      "loss": 0.7451,
+      "step": 302
+    },
+    {
+      "epoch": 0.33282987779761086,
+      "grad_norm": 0.7084535360336304,
+      "learning_rate": 0.0002963369963369963,
+      "loss": 0.7896,
+      "step": 303
+    },
+    {
+      "epoch": 0.33392832623918717,
+      "grad_norm": 0.3226016163825989,
+      "learning_rate": 0.0002962148962148962,
+      "loss": 0.5614,
+      "step": 304
+    },
+    {
+      "epoch": 0.3350267746807634,
+      "grad_norm": 0.5515668988227844,
+      "learning_rate": 0.0002960927960927961,
+      "loss": 0.6981,
+      "step": 305
+    },
+    {
+      "epoch": 0.3361252231223397,
+      "grad_norm": 0.42776307463645935,
+      "learning_rate": 0.00029597069597069595,
+      "loss": 0.5911,
+      "step": 306
+    },
+    {
+      "epoch": 0.33722367156391597,
+      "grad_norm": 0.36645814776420593,
+      "learning_rate": 0.0002958485958485958,
+      "loss": 0.5584,
+      "step": 307
+    },
+    {
+      "epoch": 0.3383221200054922,
+      "grad_norm": 0.4089672565460205,
+      "learning_rate": 0.0002957264957264957,
+      "loss": 0.6814,
+      "step": 308
+    },
+    {
+      "epoch": 0.3394205684470685,
+      "grad_norm": 0.4406324326992035,
+      "learning_rate": 0.0002956043956043956,
+      "loss": 0.5426,
+      "step": 309
+    },
+    {
+      "epoch": 0.3405190168886448,
+      "grad_norm": 0.4138193726539612,
+      "learning_rate": 0.00029548229548229544,
+      "loss": 0.7554,
+      "step": 310
+    },
+    {
+      "epoch": 0.3416174653302211,
+      "grad_norm": 0.45647338032722473,
+      "learning_rate": 0.00029536019536019535,
+      "loss": 0.4871,
+      "step": 311
+    },
+    {
+      "epoch": 0.34271591377179733,
+      "grad_norm": 0.44362974166870117,
+      "learning_rate": 0.0002952380952380952,
+      "loss": 0.7254,
+      "step": 312
+    },
+    {
+      "epoch": 0.34381436221337364,
+      "grad_norm": 0.5832559466362,
+      "learning_rate": 0.00029511599511599507,
+      "loss": 0.64,
+      "step": 313
+    },
+    {
+      "epoch": 0.3449128106549499,
+      "grad_norm": 0.6754651665687561,
+      "learning_rate": 0.000294993894993895,
+      "loss": 0.7046,
+      "step": 314
+    },
+    {
+      "epoch": 0.34601125909652614,
+      "grad_norm": 0.6487123370170593,
+      "learning_rate": 0.00029487179487179484,
+      "loss": 0.5934,
+      "step": 315
+    },
+    {
+      "epoch": 0.34710970753810244,
+      "grad_norm": 0.24118930101394653,
+      "learning_rate": 0.0002947496947496947,
+      "loss": 0.5241,
+      "step": 316
+    },
+    {
+      "epoch": 0.3482081559796787,
+      "grad_norm": 0.4580494165420532,
+      "learning_rate": 0.0002946275946275946,
+      "loss": 0.6733,
+      "step": 317
+    },
+    {
+      "epoch": 0.349306604421255,
+      "grad_norm": 0.4770609736442566,
+      "learning_rate": 0.00029450549450549447,
+      "loss": 0.5758,
+      "step": 318
+    },
+    {
+      "epoch": 0.35040505286283125,
+      "grad_norm": 0.40334221720695496,
+      "learning_rate": 0.0002943833943833944,
+      "loss": 0.5365,
+      "step": 319
+    },
+    {
+      "epoch": 0.35150350130440755,
+      "grad_norm": 0.5605480074882507,
+      "learning_rate": 0.00029426129426129424,
+      "loss": 0.5967,
+      "step": 320
+    },
+    {
+      "epoch": 0.3526019497459838,
+      "grad_norm": 0.6031836271286011,
+      "learning_rate": 0.0002941391941391941,
+      "loss": 0.6397,
+      "step": 321
+    },
+    {
+      "epoch": 0.35370039818756005,
+      "grad_norm": 0.5602075457572937,
+      "learning_rate": 0.000294017094017094,
+      "loss": 0.7253,
+      "step": 322
+    },
+    {
+      "epoch": 0.35479884662913636,
+      "grad_norm": 1.5055879354476929,
+      "learning_rate": 0.00029389499389499387,
+      "loss": 0.6066,
+      "step": 323
+    },
+    {
+      "epoch": 0.3558972950707126,
+      "grad_norm": 1.969072699546814,
+      "learning_rate": 0.0002937728937728937,
+      "loss": 0.9263,
+      "step": 324
+    },
+    {
+      "epoch": 0.3569957435122889,
+      "grad_norm": 0.43139147758483887,
+      "learning_rate": 0.00029365079365079364,
+      "loss": 0.6462,
+      "step": 325
+    },
+    {
+      "epoch": 0.35809419195386516,
+      "grad_norm": 0.40423595905303955,
+      "learning_rate": 0.0002935286935286935,
+      "loss": 0.4278,
+      "step": 326
+    },
+    {
+      "epoch": 0.35919264039544146,
+      "grad_norm": 0.41983166337013245,
+      "learning_rate": 0.00029340659340659336,
+      "loss": 0.7527,
+      "step": 327
+    },
+    {
+      "epoch": 0.3602910888370177,
+      "grad_norm": 0.6624807715415955,
+      "learning_rate": 0.00029328449328449327,
+      "loss": 0.7381,
+      "step": 328
+    },
+    {
+      "epoch": 0.36138953727859396,
+      "grad_norm": 0.6173990964889526,
+      "learning_rate": 0.00029316239316239313,
+      "loss": 0.6838,
+      "step": 329
+    },
+    {
+      "epoch": 0.36248798572017027,
+      "grad_norm": 1.1278433799743652,
+      "learning_rate": 0.000293040293040293,
+      "loss": 0.8439,
+      "step": 330
+    },
+    {
+      "epoch": 0.3635864341617465,
+      "grad_norm": 0.3453993797302246,
+      "learning_rate": 0.0002929181929181929,
+      "loss": 0.5324,
+      "step": 331
+    },
+    {
+      "epoch": 0.3646848826033228,
+      "grad_norm": 0.4151187241077423,
+      "learning_rate": 0.0002927960927960928,
+      "loss": 0.7019,
+      "step": 332
+    },
+    {
+      "epoch": 0.3657833310448991,
+      "grad_norm": 0.4247313439846039,
+      "learning_rate": 0.0002926739926739926,
+      "loss": 0.6362,
+      "step": 333
+    },
+    {
+      "epoch": 0.3668817794864754,
+      "grad_norm": 1.5250136852264404,
+      "learning_rate": 0.00029255189255189253,
+      "loss": 0.5885,
+      "step": 334
+    },
+    {
+      "epoch": 0.36798022792805163,
+      "grad_norm": 0.43669968843460083,
+      "learning_rate": 0.00029242979242979244,
+      "loss": 0.9191,
+      "step": 335
+    },
+    {
+      "epoch": 0.3690786763696279,
+      "grad_norm": 0.8063925504684448,
+      "learning_rate": 0.0002923076923076923,
+      "loss": 0.6813,
+      "step": 336
+    },
+    {
+      "epoch": 0.3701771248112042,
+      "grad_norm": 0.6002399325370789,
+      "learning_rate": 0.00029218559218559216,
+      "loss": 0.5859,
+      "step": 337
+    },
+    {
+      "epoch": 0.37127557325278043,
+      "grad_norm": 0.9405462145805359,
+      "learning_rate": 0.000292063492063492,
+      "loss": 0.7476,
+      "step": 338
+    },
+    {
+      "epoch": 0.37237402169435674,
+      "grad_norm": 0.5050615072250366,
+      "learning_rate": 0.00029194139194139193,
+      "loss": 0.5172,
+      "step": 339
+    },
+    {
+      "epoch": 0.373472470135933,
+      "grad_norm": 0.4593801200389862,
+      "learning_rate": 0.0002918192918192918,
+      "loss": 0.5405,
+      "step": 340
+    },
+    {
+      "epoch": 0.3745709185775093,
+      "grad_norm": 0.5275060534477234,
+      "learning_rate": 0.00029169719169719164,
+      "loss": 0.4537,
+      "step": 341
+    },
+    {
+      "epoch": 0.37566936701908554,
+      "grad_norm": 0.8907522559165955,
+      "learning_rate": 0.00029157509157509156,
+      "loss": 0.6826,
+      "step": 342
+    },
+    {
+      "epoch": 0.3767678154606618,
+      "grad_norm": 0.7229670882225037,
+      "learning_rate": 0.0002914529914529914,
+      "loss": 0.6072,
+      "step": 343
+    },
+    {
+      "epoch": 0.3778662639022381,
+      "grad_norm": 1.7154827117919922,
+      "learning_rate": 0.0002913308913308913,
+      "loss": 0.6956,
+      "step": 344
+    },
+    {
+      "epoch": 0.37896471234381435,
+      "grad_norm": 1.012902021408081,
+      "learning_rate": 0.0002912087912087912,
+      "loss": 0.5337,
+      "step": 345
+    },
+    {
+      "epoch": 0.38006316078539065,
+      "grad_norm": 0.6467313170433044,
+      "learning_rate": 0.00029108669108669105,
+      "loss": 0.7652,
+      "step": 346
+    },
+    {
+      "epoch": 0.3811616092269669,
+      "grad_norm": 0.5594947338104248,
+      "learning_rate": 0.0002909645909645909,
+      "loss": 0.578,
+      "step": 347
+    },
+    {
+      "epoch": 0.3822600576685432,
+      "grad_norm": 0.5808854699134827,
+      "learning_rate": 0.0002908424908424908,
+      "loss": 0.6142,
+      "step": 348
+    },
+    {
+      "epoch": 0.38335850611011946,
+      "grad_norm": 0.6067795157432556,
+      "learning_rate": 0.00029072039072039073,
+      "loss": 0.7682,
+      "step": 349
+    },
+    {
+      "epoch": 0.3844569545516957,
+      "grad_norm": 0.392993301153183,
+      "learning_rate": 0.0002905982905982906,
+      "loss": 0.6599,
+      "step": 350
+    },
+    {
+      "epoch": 0.385555402993272,
+      "grad_norm": 0.3963404893875122,
+      "learning_rate": 0.00029047619047619045,
+      "loss": 0.7079,
+      "step": 351
+    },
+    {
+      "epoch": 0.38665385143484826,
+      "grad_norm": 0.3471222221851349,
+      "learning_rate": 0.00029035409035409036,
+      "loss": 0.463,
+      "step": 352
+    },
+    {
+      "epoch": 0.38775229987642457,
+      "grad_norm": 0.5496531128883362,
+      "learning_rate": 0.0002902319902319902,
+      "loss": 0.7639,
+      "step": 353
+    },
+    {
+      "epoch": 0.3888507483180008,
+      "grad_norm": 0.5482885241508484,
+      "learning_rate": 0.0002901098901098901,
+      "loss": 0.4198,
+      "step": 354
+    },
+    {
+      "epoch": 0.3899491967595771,
+      "grad_norm": 0.7329181432723999,
+      "learning_rate": 0.00028998778998779,
+      "loss": 0.6057,
+      "step": 355
+    },
+    {
+      "epoch": 0.39104764520115337,
+      "grad_norm": 0.41850918531417847,
+      "learning_rate": 0.00028986568986568985,
+      "loss": 0.605,
+      "step": 356
+    },
+    {
+      "epoch": 0.3921460936427296,
+      "grad_norm": 0.4463609457015991,
+      "learning_rate": 0.0002897435897435897,
+      "loss": 0.7381,
+      "step": 357
+    },
+    {
+      "epoch": 0.3932445420843059,
+      "grad_norm": 0.7207491397857666,
+      "learning_rate": 0.0002896214896214896,
+      "loss": 0.6892,
+      "step": 358
+    },
+    {
+      "epoch": 0.3943429905258822,
+      "grad_norm": 0.3715958595275879,
+      "learning_rate": 0.0002894993894993895,
+      "loss": 0.5426,
+      "step": 359
+    },
+    {
+      "epoch": 0.3954414389674585,
+      "grad_norm": 0.7077822685241699,
+      "learning_rate": 0.00028937728937728933,
+      "loss": 0.5923,
+      "step": 360
+    },
+    {
+      "epoch": 0.39653988740903473,
+      "grad_norm": 0.5109585523605347,
+      "learning_rate": 0.00028925518925518925,
+      "loss": 0.5939,
+      "step": 361
+    },
+    {
+      "epoch": 0.39763833585061104,
+      "grad_norm": 0.6105355024337769,
+      "learning_rate": 0.0002891330891330891,
+      "loss": 1.0345,
+      "step": 362
+    },
+    {
+      "epoch": 0.3987367842921873,
+      "grad_norm": 0.479732871055603,
+      "learning_rate": 0.000289010989010989,
+      "loss": 0.71,
+      "step": 363
+    },
+    {
+      "epoch": 0.39983523273376353,
+      "grad_norm": 0.8600007891654968,
+      "learning_rate": 0.0002888888888888888,
+      "loss": 0.7406,
+      "step": 364
+    },
+    {
+      "epoch": 0.40093368117533984,
+      "grad_norm": 0.6584550738334656,
+      "learning_rate": 0.00028876678876678873,
+      "loss": 0.6658,
+      "step": 365
+    },
+    {
+      "epoch": 0.4020321296169161,
+      "grad_norm": 0.7251041531562805,
+      "learning_rate": 0.00028864468864468865,
+      "loss": 0.8425,
+      "step": 366
+    },
+    {
+      "epoch": 0.4031305780584924,
+      "grad_norm": 0.5729238390922546,
+      "learning_rate": 0.0002885225885225885,
+      "loss": 0.9054,
+      "step": 367
+    },
+    {
+      "epoch": 0.40422902650006864,
+      "grad_norm": 1.1829932928085327,
+      "learning_rate": 0.00028840048840048836,
+      "loss": 0.9232,
+      "step": 368
+    },
+    {
+      "epoch": 0.40532747494164495,
+      "grad_norm": 0.37746721506118774,
+      "learning_rate": 0.0002882783882783883,
+      "loss": 0.9619,
+      "step": 369
+    },
+    {
+      "epoch": 0.4064259233832212,
+      "grad_norm": 0.5653749108314514,
+      "learning_rate": 0.00028815628815628813,
+      "loss": 0.7182,
+      "step": 370
+    },
+    {
+      "epoch": 0.40752437182479745,
+      "grad_norm": 0.6024563312530518,
+      "learning_rate": 0.000288034188034188,
+      "loss": 0.6881,
+      "step": 371
+    },
+    {
+      "epoch": 0.40862282026637375,
+      "grad_norm": 0.485350102186203,
+      "learning_rate": 0.0002879120879120879,
+      "loss": 0.6451,
+      "step": 372
+    },
+    {
+      "epoch": 0.40972126870795,
+      "grad_norm": 0.5762611627578735,
+      "learning_rate": 0.00028778998778998776,
+      "loss": 0.7818,
+      "step": 373
+    },
+    {
+      "epoch": 0.4108197171495263,
+      "grad_norm": 0.7961844801902771,
+      "learning_rate": 0.0002876678876678876,
+      "loss": 0.6682,
+      "step": 374
+    },
+    {
+      "epoch": 0.41191816559110256,
+      "grad_norm": 0.4630587697029114,
+      "learning_rate": 0.00028754578754578753,
+      "loss": 0.9015,
+      "step": 375
+    },
+    {
+      "epoch": 0.41301661403267886,
+      "grad_norm": 0.6592808961868286,
+      "learning_rate": 0.0002874236874236874,
+      "loss": 0.5738,
+      "step": 376
+    },
+    {
+      "epoch": 0.4141150624742551,
+      "grad_norm": 0.4788278639316559,
+      "learning_rate": 0.00028730158730158725,
+      "loss": 0.7022,
+      "step": 377
+    },
+    {
+      "epoch": 0.41521351091583136,
+      "grad_norm": 0.5041861534118652,
+      "learning_rate": 0.00028717948717948716,
+      "loss": 0.6137,
+      "step": 378
+    },
+    {
+      "epoch": 0.41631195935740767,
+      "grad_norm": 0.5436013340950012,
+      "learning_rate": 0.000287057387057387,
+      "loss": 0.6621,
+      "step": 379
+    },
+    {
+      "epoch": 0.4174104077989839,
+      "grad_norm": 0.5102400183677673,
+      "learning_rate": 0.00028693528693528694,
+      "loss": 0.6627,
+      "step": 380
+    },
+    {
+      "epoch": 0.4185088562405602,
+      "grad_norm": 0.43655040860176086,
+      "learning_rate": 0.0002868131868131868,
+      "loss": 0.6475,
+      "step": 381
+    },
+    {
+      "epoch": 0.4196073046821365,
+      "grad_norm": 0.3989826738834381,
+      "learning_rate": 0.00028669108669108665,
+      "loss": 0.5483,
+      "step": 382
+    },
+    {
+      "epoch": 0.4207057531237128,
+      "grad_norm": 0.7781158685684204,
+      "learning_rate": 0.00028656898656898656,
+      "loss": 0.6475,
+      "step": 383
+    },
+    {
+      "epoch": 0.421804201565289,
+      "grad_norm": 0.8119930624961853,
+      "learning_rate": 0.0002864468864468864,
+      "loss": 0.8122,
+      "step": 384
+    },
+    {
+      "epoch": 0.4229026500068653,
+      "grad_norm": 0.7233585119247437,
+      "learning_rate": 0.0002863247863247863,
+      "loss": 0.7837,
+      "step": 385
+    },
+    {
+      "epoch": 0.4240010984484416,
+      "grad_norm": 0.41249507665634155,
+      "learning_rate": 0.0002862026862026862,
+      "loss": 0.6916,
+      "step": 386
+    },
+    {
+      "epoch": 0.42509954689001783,
+      "grad_norm": 0.4865298867225647,
+      "learning_rate": 0.00028608058608058605,
+      "loss": 0.595,
+      "step": 387
+    },
+    {
+      "epoch": 0.42619799533159414,
+      "grad_norm": 0.6057963371276855,
+      "learning_rate": 0.0002859584859584859,
+      "loss": 0.7214,
+      "step": 388
+    },
+    {
+      "epoch": 0.4272964437731704,
+      "grad_norm": 0.5390968918800354,
+      "learning_rate": 0.0002858363858363858,
+      "loss": 0.805,
+      "step": 389
+    },
+    {
+      "epoch": 0.4283948922147467,
+      "grad_norm": 0.5944109559059143,
+      "learning_rate": 0.0002857142857142857,
+      "loss": 0.9953,
+      "step": 390
+    },
+    {
+      "epoch": 0.42949334065632294,
+      "grad_norm": 0.5480278134346008,
+      "learning_rate": 0.00028559218559218554,
+      "loss": 0.8406,
+      "step": 391
+    },
+    {
+      "epoch": 0.4305917890978992,
+      "grad_norm": 0.5168552994728088,
+      "learning_rate": 0.00028547008547008545,
+      "loss": 0.9715,
+      "step": 392
+    },
+    {
+      "epoch": 0.4316902375394755,
+      "grad_norm": 0.4859452247619629,
+      "learning_rate": 0.0002853479853479853,
+      "loss": 0.7368,
+      "step": 393
+    },
+    {
+      "epoch": 0.43278868598105175,
+      "grad_norm": 0.4697234034538269,
+      "learning_rate": 0.0002852258852258852,
+      "loss": 0.4801,
+      "step": 394
+    },
+    {
+      "epoch": 0.43388713442262805,
+      "grad_norm": 0.6198891401290894,
+      "learning_rate": 0.0002851037851037851,
+      "loss": 0.5184,
+      "step": 395
+    },
+    {
+      "epoch": 0.4349855828642043,
+      "grad_norm": 0.531563401222229,
+      "learning_rate": 0.00028498168498168494,
+      "loss": 0.8047,
+      "step": 396
+    },
+    {
+      "epoch": 0.4360840313057806,
+      "grad_norm": 0.4610724449157715,
+      "learning_rate": 0.00028485958485958485,
+      "loss": 0.4583,
+      "step": 397
+    },
+    {
+      "epoch": 0.43718247974735686,
+      "grad_norm": 0.5609697699546814,
+      "learning_rate": 0.0002847374847374847,
+      "loss": 0.7362,
+      "step": 398
+    },
+    {
+      "epoch": 0.4382809281889331,
+      "grad_norm": 0.5257968306541443,
+      "learning_rate": 0.00028461538461538457,
+      "loss": 0.8173,
+      "step": 399
+    },
+    {
+      "epoch": 0.4393793766305094,
+      "grad_norm": 0.8307009339332581,
+      "learning_rate": 0.0002844932844932845,
+      "loss": 0.5507,
+      "step": 400
+    },
+    {
+      "epoch": 0.44047782507208566,
+      "grad_norm": 0.36615508794784546,
+      "learning_rate": 0.00028437118437118434,
+      "loss": 0.6605,
+      "step": 401
+    },
+    {
+      "epoch": 0.44157627351366197,
+      "grad_norm": 0.35138362646102905,
+      "learning_rate": 0.0002842490842490842,
+      "loss": 0.6614,
+      "step": 402
+    },
+    {
+      "epoch": 0.4426747219552382,
+      "grad_norm": 0.5054494738578796,
+      "learning_rate": 0.0002841269841269841,
+      "loss": 0.799,
+      "step": 403
+    },
+    {
+      "epoch": 0.4437731703968145,
+      "grad_norm": 0.4711816608905792,
+      "learning_rate": 0.00028400488400488397,
+      "loss": 0.8892,
+      "step": 404
+    },
+    {
+      "epoch": 0.44487161883839077,
+      "grad_norm": 0.5073884725570679,
+      "learning_rate": 0.00028388278388278383,
+      "loss": 0.8156,
+      "step": 405
+    },
+    {
+      "epoch": 0.445970067279967,
+      "grad_norm": 0.29938632249832153,
+      "learning_rate": 0.00028376068376068374,
+      "loss": 0.7598,
+      "step": 406
+    },
+    {
+      "epoch": 0.4470685157215433,
+      "grad_norm": 1.745937466621399,
+      "learning_rate": 0.00028363858363858365,
+      "loss": 0.7829,
+      "step": 407
+    },
+    {
+      "epoch": 0.4481669641631196,
+      "grad_norm": 0.46887943148612976,
+      "learning_rate": 0.00028351648351648346,
+      "loss": 0.7798,
+      "step": 408
+    },
+    {
+      "epoch": 0.4492654126046959,
+      "grad_norm": 0.4274987280368805,
+      "learning_rate": 0.00028339438339438337,
+      "loss": 0.8407,
+      "step": 409
+    },
+    {
+      "epoch": 0.45036386104627213,
+      "grad_norm": 0.4445902109146118,
+      "learning_rate": 0.0002832722832722833,
+      "loss": 0.7394,
+      "step": 410
+    },
+    {
+      "epoch": 0.45146230948784843,
+      "grad_norm": 0.3842466175556183,
+      "learning_rate": 0.00028315018315018314,
+      "loss": 0.7781,
+      "step": 411
+    },
+    {
+      "epoch": 0.4525607579294247,
+      "grad_norm": 0.5660600066184998,
+      "learning_rate": 0.000283028083028083,
+      "loss": 0.8058,
+      "step": 412
+    },
+    {
+      "epoch": 0.45365920637100093,
+      "grad_norm": 0.442911297082901,
+      "learning_rate": 0.0002829059829059829,
+      "loss": 0.808,
+      "step": 413
+    },
+    {
+      "epoch": 0.45475765481257724,
+      "grad_norm": 0.9051260352134705,
+      "learning_rate": 0.00028278388278388277,
+      "loss": 0.9427,
+      "step": 414
+    },
+    {
+      "epoch": 0.4558561032541535,
+      "grad_norm": 0.8027593493461609,
+      "learning_rate": 0.00028266178266178263,
+      "loss": 0.531,
+      "step": 415
+    },
+    {
+      "epoch": 0.4569545516957298,
+      "grad_norm": 0.36242446303367615,
+      "learning_rate": 0.0002825396825396825,
+      "loss": 0.5609,
+      "step": 416
+    },
+    {
+      "epoch": 0.45805300013730604,
+      "grad_norm": 0.6095871925354004,
+      "learning_rate": 0.0002824175824175824,
+      "loss": 0.7424,
+      "step": 417
+    },
+    {
+      "epoch": 0.45915144857888235,
+      "grad_norm": 0.5102814435958862,
+      "learning_rate": 0.00028229548229548226,
+      "loss": 0.8861,
+      "step": 418
+    },
+    {
+      "epoch": 0.4602498970204586,
+      "grad_norm": 0.375265896320343,
+      "learning_rate": 0.0002821733821733821,
+      "loss": 0.6235,
+      "step": 419
+    },
+    {
+      "epoch": 0.4613483454620349,
+      "grad_norm": 0.4506315588951111,
+      "learning_rate": 0.00028205128205128203,
+      "loss": 0.6059,
+      "step": 420
+    },
+    {
+      "epoch": 0.46244679390361115,
+      "grad_norm": 0.8119642734527588,
+      "learning_rate": 0.0002819291819291819,
+      "loss": 0.7821,
+      "step": 421
+    },
+    {
+      "epoch": 0.4635452423451874,
+      "grad_norm": 0.42945513129234314,
+      "learning_rate": 0.00028180708180708175,
+      "loss": 0.9503,
+      "step": 422
+    },
+    {
+      "epoch": 0.4646436907867637,
+      "grad_norm": 0.35567665100097656,
+      "learning_rate": 0.00028168498168498166,
+      "loss": 0.5243,
+      "step": 423
+    },
+    {
+      "epoch": 0.46574213922833996,
+      "grad_norm": 0.5160343647003174,
+      "learning_rate": 0.00028156288156288157,
+      "loss": 0.5767,
+      "step": 424
+    },
+    {
+      "epoch": 0.46684058766991626,
+      "grad_norm": 0.37530624866485596,
+      "learning_rate": 0.00028144078144078143,
+      "loss": 1.2016,
+      "step": 425
+    },
+    {
+      "epoch": 0.4679390361114925,
+      "grad_norm": 0.5283146500587463,
+      "learning_rate": 0.0002813186813186813,
+      "loss": 0.5958,
+      "step": 426
+    },
+    {
+      "epoch": 0.4690374845530688,
+      "grad_norm": 0.5217192769050598,
+      "learning_rate": 0.0002811965811965812,
+      "loss": 0.715,
+      "step": 427
+    },
+    {
+      "epoch": 0.47013593299464507,
+      "grad_norm": 0.5092077851295471,
+      "learning_rate": 0.00028107448107448106,
+      "loss": 0.6942,
+      "step": 428
+    },
+    {
+      "epoch": 0.4712343814362213,
+      "grad_norm": 0.7683324813842773,
+      "learning_rate": 0.0002809523809523809,
+      "loss": 1.0185,
+      "step": 429
+    },
+    {
+      "epoch": 0.4723328298777976,
+      "grad_norm": 0.3117397725582123,
+      "learning_rate": 0.00028083028083028083,
+      "loss": 0.6949,
+      "step": 430
+    },
+    {
+      "epoch": 0.47343127831937387,
+      "grad_norm": 0.3218965232372284,
+      "learning_rate": 0.0002807081807081807,
+      "loss": 0.6872,
+      "step": 431
+    },
+    {
+      "epoch": 0.4745297267609502,
+      "grad_norm": 1.104121446609497,
+      "learning_rate": 0.00028058608058608055,
+      "loss": 0.6628,
+      "step": 432
+    },
+    {
+      "epoch": 0.4756281752025264,
+      "grad_norm": 0.3224816620349884,
+      "learning_rate": 0.00028046398046398046,
+      "loss": 0.5974,
+      "step": 433
+    },
+    {
+      "epoch": 0.47672662364410273,
+      "grad_norm": 0.5742220878601074,
+      "learning_rate": 0.0002803418803418803,
+      "loss": 0.7248,
+      "step": 434
+    },
+    {
+      "epoch": 0.477825072085679,
+      "grad_norm": 0.5449275374412537,
+      "learning_rate": 0.0002802197802197802,
+      "loss": 0.8552,
+      "step": 435
+    },
+    {
+      "epoch": 0.47892352052725523,
+      "grad_norm": 0.44660067558288574,
+      "learning_rate": 0.0002800976800976801,
+      "loss": 0.6968,
+      "step": 436
+    },
+    {
+      "epoch": 0.48002196896883154,
+      "grad_norm": 0.4287508428096771,
+      "learning_rate": 0.00027997557997557995,
+      "loss": 0.8101,
+      "step": 437
+    },
+    {
+      "epoch": 0.4811204174104078,
+      "grad_norm": 0.4142225384712219,
+      "learning_rate": 0.00027985347985347986,
+      "loss": 0.5379,
+      "step": 438
+    },
+    {
+      "epoch": 0.4822188658519841,
+      "grad_norm": 1.246833324432373,
+      "learning_rate": 0.0002797313797313797,
+      "loss": 0.7116,
+      "step": 439
+    },
+    {
+      "epoch": 0.48331731429356034,
+      "grad_norm": 0.3845030963420868,
+      "learning_rate": 0.0002796092796092796,
+      "loss": 0.8088,
+      "step": 440
+    },
+    {
+      "epoch": 0.48441576273513665,
+      "grad_norm": 1.4492995738983154,
+      "learning_rate": 0.0002794871794871795,
+      "loss": 0.7358,
+      "step": 441
+    },
+    {
+      "epoch": 0.4855142111767129,
+      "grad_norm": 0.40994521975517273,
+      "learning_rate": 0.00027936507936507935,
+      "loss": 0.6228,
+      "step": 442
+    },
+    {
+      "epoch": 0.48661265961828915,
+      "grad_norm": 0.4782777428627014,
+      "learning_rate": 0.0002792429792429792,
+      "loss": 0.4944,
+      "step": 443
+    },
+    {
+      "epoch": 0.48771110805986545,
+      "grad_norm": 0.47269922494888306,
+      "learning_rate": 0.0002791208791208791,
+      "loss": 0.7023,
+      "step": 444
+    },
+    {
+      "epoch": 0.4888095565014417,
+      "grad_norm": 0.5529118776321411,
+      "learning_rate": 0.000278998778998779,
+      "loss": 0.7717,
+      "step": 445
+    },
+    {
+      "epoch": 0.489908004943018,
+      "grad_norm": 0.4244072139263153,
+      "learning_rate": 0.00027887667887667884,
+      "loss": 0.7902,
+      "step": 446
+    },
+    {
+      "epoch": 0.49100645338459425,
+      "grad_norm": 1.4737539291381836,
+      "learning_rate": 0.00027875457875457875,
+      "loss": 0.5784,
+      "step": 447
+    },
+    {
+      "epoch": 0.49210490182617056,
+      "grad_norm": 0.40120208263397217,
+      "learning_rate": 0.0002786324786324786,
+      "loss": 0.7974,
+      "step": 448
+    },
+    {
+      "epoch": 0.4932033502677468,
+      "grad_norm": 0.5481031537055969,
+      "learning_rate": 0.00027851037851037846,
+      "loss": 0.7867,
+      "step": 449
+    },
+    {
+      "epoch": 0.49430179870932306,
+      "grad_norm": 0.36719343066215515,
+      "learning_rate": 0.0002783882783882784,
+      "loss": 0.6543,
+      "step": 450
+    },
+    {
+      "epoch": 0.49540024715089936,
+      "grad_norm": 0.3980066776275635,
+      "learning_rate": 0.00027826617826617824,
+      "loss": 0.5395,
+      "step": 451
+    },
+    {
+      "epoch": 0.4964986955924756,
+      "grad_norm": 0.45570313930511475,
+      "learning_rate": 0.0002781440781440781,
+      "loss": 0.7908,
+      "step": 452
+    },
+    {
+      "epoch": 0.4975971440340519,
+      "grad_norm": 0.41858601570129395,
+      "learning_rate": 0.000278021978021978,
+      "loss": 0.5248,
+      "step": 453
+    },
+    {
+      "epoch": 0.49869559247562817,
+      "grad_norm": 0.5019702315330505,
+      "learning_rate": 0.00027789987789987786,
+      "loss": 0.8006,
+      "step": 454
+    },
+    {
+      "epoch": 0.4997940409172045,
+      "grad_norm": 0.4589880108833313,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.7294,
+      "step": 455
+    },
+    {
+      "epoch": 0.5008924893587807,
+      "grad_norm": 0.5679266452789307,
+      "learning_rate": 0.00027765567765567764,
+      "loss": 0.651,
+      "step": 456
+    },
+    {
+      "epoch": 0.501990937800357,
+      "grad_norm": 0.4854479134082794,
+      "learning_rate": 0.0002775335775335775,
+      "loss": 0.9908,
+      "step": 457
+    },
+    {
+      "epoch": 0.5030893862419332,
+      "grad_norm": 0.4964112341403961,
+      "learning_rate": 0.0002774114774114774,
+      "loss": 0.8084,
+      "step": 458
+    },
+    {
+      "epoch": 0.5041878346835096,
+      "grad_norm": 0.5130513906478882,
+      "learning_rate": 0.00027728937728937727,
+      "loss": 0.8389,
+      "step": 459
+    },
+    {
+      "epoch": 0.5052862831250858,
+      "grad_norm": 0.4784137010574341,
+      "learning_rate": 0.0002771672771672771,
+      "loss": 0.5497,
+      "step": 460
+    },
+    {
+      "epoch": 0.5063847315666621,
+      "grad_norm": 0.28685998916625977,
+      "learning_rate": 0.00027704517704517704,
+      "loss": 0.491,
+      "step": 461
+    },
+    {
+      "epoch": 0.5074831800082383,
+      "grad_norm": 0.5337100625038147,
+      "learning_rate": 0.0002769230769230769,
+      "loss": 0.8315,
+      "step": 462
+    },
+    {
+      "epoch": 0.5085816284498146,
+      "grad_norm": 0.5431344509124756,
+      "learning_rate": 0.00027680097680097675,
+      "loss": 0.5996,
+      "step": 463
+    },
+    {
+      "epoch": 0.5096800768913909,
+      "grad_norm": 0.4546130299568176,
+      "learning_rate": 0.00027667887667887667,
+      "loss": 0.5647,
+      "step": 464
+    },
+    {
+      "epoch": 0.5107785253329672,
+      "grad_norm": 0.6298655271530151,
+      "learning_rate": 0.0002765567765567765,
+      "loss": 0.7684,
+      "step": 465
+    },
+    {
+      "epoch": 0.5118769737745434,
+      "grad_norm": 0.44330841302871704,
+      "learning_rate": 0.0002764346764346764,
+      "loss": 0.4906,
+      "step": 466
+    },
+    {
+      "epoch": 0.5129754222161197,
+      "grad_norm": 0.3824306130409241,
+      "learning_rate": 0.0002763125763125763,
+      "loss": 0.6123,
+      "step": 467
+    },
+    {
+      "epoch": 0.514073870657696,
+      "grad_norm": 0.3225514590740204,
+      "learning_rate": 0.00027619047619047615,
+      "loss": 0.7535,
+      "step": 468
+    },
+    {
+      "epoch": 0.5151723190992723,
+      "grad_norm": 0.701239824295044,
+      "learning_rate": 0.00027606837606837607,
+      "loss": 0.9643,
+      "step": 469
+    },
+    {
+      "epoch": 0.5162707675408486,
+      "grad_norm": 0.37800920009613037,
+      "learning_rate": 0.0002759462759462759,
+      "loss": 0.543,
+      "step": 470
+    },
+    {
+      "epoch": 0.5173692159824248,
+      "grad_norm": 0.3521328568458557,
+      "learning_rate": 0.0002758241758241758,
+      "loss": 0.7157,
+      "step": 471
+    },
+    {
+      "epoch": 0.518467664424001,
+      "grad_norm": 0.2659924626350403,
+      "learning_rate": 0.0002757020757020757,
+      "loss": 0.7334,
+      "step": 472
+    },
+    {
+      "epoch": 0.5195661128655774,
+      "grad_norm": 0.42815065383911133,
+      "learning_rate": 0.00027557997557997555,
+      "loss": 1.2015,
+      "step": 473
+    },
+    {
+      "epoch": 0.5206645613071537,
+      "grad_norm": 0.7758998870849609,
+      "learning_rate": 0.0002754578754578754,
+      "loss": 0.9493,
+      "step": 474
+    },
+    {
+      "epoch": 0.5217630097487299,
+      "grad_norm": 0.46281251311302185,
+      "learning_rate": 0.0002753357753357753,
+      "loss": 0.9159,
+      "step": 475
+    },
+    {
+      "epoch": 0.5228614581903062,
+      "grad_norm": 0.3668971061706543,
+      "learning_rate": 0.0002752136752136752,
+      "loss": 0.4869,
+      "step": 476
+    },
+    {
+      "epoch": 0.5239599066318824,
+      "grad_norm": 0.462534099817276,
+      "learning_rate": 0.00027509157509157504,
+      "loss": 0.6439,
+      "step": 477
+    },
+    {
+      "epoch": 0.5250583550734588,
+      "grad_norm": 0.6341688632965088,
+      "learning_rate": 0.00027496947496947495,
+      "loss": 0.6948,
+      "step": 478
+    },
+    {
+      "epoch": 0.526156803515035,
+      "grad_norm": 0.5469139814376831,
+      "learning_rate": 0.0002748473748473748,
+      "loss": 1.016,
+      "step": 479
+    },
+    {
+      "epoch": 0.5272552519566113,
+      "grad_norm": 0.438204288482666,
+      "learning_rate": 0.00027472527472527467,
+      "loss": 0.6941,
+      "step": 480
+    },
+    {
+      "epoch": 0.5283537003981875,
+      "grad_norm": 0.586700975894928,
+      "learning_rate": 0.0002746031746031746,
+      "loss": 0.6649,
+      "step": 481
+    },
+    {
+      "epoch": 0.5294521488397639,
+      "grad_norm": 0.4077949523925781,
+      "learning_rate": 0.0002744810744810745,
+      "loss": 0.5948,
+      "step": 482
+    },
+    {
+      "epoch": 0.5305505972813401,
+      "grad_norm": 0.3756411373615265,
+      "learning_rate": 0.0002743589743589743,
+      "loss": 0.4915,
+      "step": 483
+    },
+    {
+      "epoch": 0.5316490457229164,
+      "grad_norm": 1.2067008018493652,
+      "learning_rate": 0.0002742368742368742,
+      "loss": 0.8795,
+      "step": 484
+    },
+    {
+      "epoch": 0.5327474941644926,
+      "grad_norm": 0.3097778260707855,
+      "learning_rate": 0.0002741147741147741,
+      "loss": 0.5478,
+      "step": 485
+    },
+    {
+      "epoch": 0.5338459426060689,
+      "grad_norm": 0.5536866188049316,
+      "learning_rate": 0.000273992673992674,
+      "loss": 0.7042,
+      "step": 486
+    },
+    {
+      "epoch": 0.5349443910476452,
+      "grad_norm": 0.5930231809616089,
+      "learning_rate": 0.00027387057387057384,
+      "loss": 0.7108,
+      "step": 487
+    },
+    {
+      "epoch": 0.5360428394892215,
+      "grad_norm": 0.39304253458976746,
+      "learning_rate": 0.00027374847374847375,
+      "loss": 0.788,
+      "step": 488
+    },
+    {
+      "epoch": 0.5371412879307977,
+      "grad_norm": 0.5238274335861206,
+      "learning_rate": 0.0002736263736263736,
+      "loss": 0.9887,
+      "step": 489
+    },
+    {
+      "epoch": 0.538239736372374,
+      "grad_norm": 0.5993770956993103,
+      "learning_rate": 0.00027350427350427347,
+      "loss": 0.7819,
+      "step": 490
+    },
+    {
+      "epoch": 0.5393381848139503,
+      "grad_norm": 0.4601563811302185,
+      "learning_rate": 0.00027338217338217333,
+      "loss": 0.4347,
+      "step": 491
+    },
+    {
+      "epoch": 0.5404366332555266,
+      "grad_norm": 0.5292415022850037,
+      "learning_rate": 0.00027326007326007324,
+      "loss": 0.5248,
+      "step": 492
+    },
+    {
+      "epoch": 0.5415350816971028,
+      "grad_norm": 0.37247565388679504,
+      "learning_rate": 0.0002731379731379731,
+      "loss": 0.5412,
+      "step": 493
+    },
+    {
+      "epoch": 0.5426335301386791,
+      "grad_norm": 0.6865994930267334,
+      "learning_rate": 0.00027301587301587296,
+      "loss": 0.8263,
+      "step": 494
+    },
+    {
+      "epoch": 0.5437319785802553,
+      "grad_norm": 0.5019715428352356,
+      "learning_rate": 0.00027289377289377287,
+      "loss": 0.7084,
+      "step": 495
+    },
+    {
+      "epoch": 0.5448304270218317,
+      "grad_norm": 0.8432828783988953,
+      "learning_rate": 0.00027277167277167273,
+      "loss": 0.6188,
+      "step": 496
+    },
+    {
+      "epoch": 0.545928875463408,
+      "grad_norm": 0.594881534576416,
+      "learning_rate": 0.0002726495726495726,
+      "loss": 0.8923,
+      "step": 497
+    },
+    {
+      "epoch": 0.5470273239049842,
+      "grad_norm": 0.5573694705963135,
+      "learning_rate": 0.0002725274725274725,
+      "loss": 0.6351,
+      "step": 498
+    },
+    {
+      "epoch": 0.5481257723465605,
+      "grad_norm": 0.30426710844039917,
+      "learning_rate": 0.0002724053724053724,
+      "loss": 0.6359,
+      "step": 499
+    },
+    {
+      "epoch": 0.5492242207881367,
+      "grad_norm": 0.759385883808136,
+      "learning_rate": 0.00027228327228327227,
+      "loss": 0.6131,
+      "step": 500
+    },
+    {
+      "epoch": 0.5503226692297131,
+      "grad_norm": 0.5436901450157166,
+      "learning_rate": 0.00027216117216117213,
+      "loss": 0.5232,
+      "step": 501
+    },
+    {
+      "epoch": 0.5514211176712893,
+      "grad_norm": 0.5924163460731506,
+      "learning_rate": 0.00027203907203907204,
+      "loss": 0.9594,
+      "step": 502
+    },
+    {
+      "epoch": 0.5525195661128656,
+      "grad_norm": 0.49177658557891846,
+      "learning_rate": 0.0002719169719169719,
+      "loss": 0.842,
+      "step": 503
+    },
+    {
+      "epoch": 0.5536180145544418,
+      "grad_norm": 0.4437295198440552,
+      "learning_rate": 0.00027179487179487176,
+      "loss": 1.0338,
+      "step": 504
+    },
+    {
+      "epoch": 0.5547164629960182,
+      "grad_norm": 0.426213800907135,
+      "learning_rate": 0.00027167277167277167,
+      "loss": 0.6375,
+      "step": 505
+    },
+    {
+      "epoch": 0.5558149114375944,
+      "grad_norm": 0.4599516689777374,
+      "learning_rate": 0.00027155067155067153,
+      "loss": 0.5005,
+      "step": 506
+    },
+    {
+      "epoch": 0.5569133598791707,
+      "grad_norm": 0.647957980632782,
+      "learning_rate": 0.0002714285714285714,
+      "loss": 0.6292,
+      "step": 507
+    },
+    {
+      "epoch": 0.5580118083207469,
+      "grad_norm": 0.7891755104064941,
+      "learning_rate": 0.0002713064713064713,
+      "loss": 0.697,
+      "step": 508
+    },
+    {
+      "epoch": 0.5591102567623232,
+      "grad_norm": 0.5290817618370056,
+      "learning_rate": 0.00027118437118437116,
+      "loss": 0.4547,
+      "step": 509
+    },
+    {
+      "epoch": 0.5602087052038995,
+      "grad_norm": 0.4025941789150238,
+      "learning_rate": 0.000271062271062271,
+      "loss": 0.6299,
+      "step": 510
+    },
+    {
+      "epoch": 0.5613071536454758,
+      "grad_norm": 0.7768287658691406,
+      "learning_rate": 0.00027094017094017093,
+      "loss": 0.6813,
+      "step": 511
+    },
+    {
+      "epoch": 0.562405602087052,
+      "grad_norm": 0.6977662444114685,
+      "learning_rate": 0.0002708180708180708,
+      "loss": 0.8217,
+      "step": 512
+    },
+    {
+      "epoch": 0.5635040505286283,
+      "grad_norm": 0.5238949060440063,
+      "learning_rate": 0.0002706959706959707,
+      "loss": 0.7348,
+      "step": 513
+    },
+    {
+      "epoch": 0.5646024989702045,
+      "grad_norm": 0.5099830627441406,
+      "learning_rate": 0.00027057387057387056,
+      "loss": 0.9894,
+      "step": 514
+    },
+    {
+      "epoch": 0.5657009474117809,
+      "grad_norm": 0.6254756450653076,
+      "learning_rate": 0.0002704517704517704,
+      "loss": 0.9258,
+      "step": 515
+    },
+    {
+      "epoch": 0.5667993958533571,
+      "grad_norm": 0.40313196182250977,
+      "learning_rate": 0.00027032967032967033,
+      "loss": 0.8115,
+      "step": 516
+    },
+    {
+      "epoch": 0.5678978442949334,
+      "grad_norm": 0.9706575274467468,
+      "learning_rate": 0.0002702075702075702,
+      "loss": 0.5204,
+      "step": 517
+    },
+    {
+      "epoch": 0.5689962927365096,
+      "grad_norm": 0.36777085065841675,
+      "learning_rate": 0.00027008547008547005,
+      "loss": 0.7716,
+      "step": 518
+    },
+    {
+      "epoch": 0.570094741178086,
+      "grad_norm": 0.48726886510849,
+      "learning_rate": 0.00026996336996336996,
+      "loss": 0.7745,
+      "step": 519
+    },
+    {
+      "epoch": 0.5711931896196623,
+      "grad_norm": 0.3590470850467682,
+      "learning_rate": 0.0002698412698412698,
+      "loss": 0.7038,
+      "step": 520
+    },
+    {
+      "epoch": 0.5722916380612385,
+      "grad_norm": 0.7103118896484375,
+      "learning_rate": 0.0002697191697191697,
+      "loss": 0.8368,
+      "step": 521
+    },
+    {
+      "epoch": 0.5733900865028148,
+      "grad_norm": 0.5503933429718018,
+      "learning_rate": 0.0002695970695970696,
+      "loss": 0.6164,
+      "step": 522
+    },
+    {
+      "epoch": 0.574488534944391,
+      "grad_norm": 0.5255150198936462,
+      "learning_rate": 0.00026947496947496945,
+      "loss": 0.8886,
+      "step": 523
+    },
+    {
+      "epoch": 0.5755869833859674,
+      "grad_norm": 0.4872569739818573,
+      "learning_rate": 0.0002693528693528693,
+      "loss": 0.6277,
+      "step": 524
+    },
+    {
+      "epoch": 0.5766854318275436,
+      "grad_norm": 0.3748464584350586,
+      "learning_rate": 0.0002692307692307692,
+      "loss": 0.6471,
+      "step": 525
+    },
+    {
+      "epoch": 0.5777838802691199,
+      "grad_norm": 0.4401276111602783,
+      "learning_rate": 0.0002691086691086691,
+      "loss": 0.9846,
+      "step": 526
+    },
+    {
+      "epoch": 0.5788823287106961,
+      "grad_norm": 0.9565305709838867,
+      "learning_rate": 0.00026898656898656894,
+      "loss": 0.9471,
+      "step": 527
+    },
+    {
+      "epoch": 0.5799807771522724,
+      "grad_norm": 0.6307245492935181,
+      "learning_rate": 0.00026886446886446885,
+      "loss": 0.9168,
+      "step": 528
+    },
+    {
+      "epoch": 0.5810792255938487,
+      "grad_norm": 0.49177634716033936,
+      "learning_rate": 0.0002687423687423687,
+      "loss": 0.5464,
+      "step": 529
+    },
+    {
+      "epoch": 0.582177674035425,
+      "grad_norm": 0.68553626537323,
+      "learning_rate": 0.0002686202686202686,
+      "loss": 0.5874,
+      "step": 530
+    },
+    {
+      "epoch": 0.5832761224770012,
+      "grad_norm": 0.3811597228050232,
+      "learning_rate": 0.0002684981684981685,
+      "loss": 0.766,
+      "step": 531
+    },
+    {
+      "epoch": 0.5843745709185775,
+      "grad_norm": 0.6634503602981567,
+      "learning_rate": 0.00026837606837606834,
+      "loss": 0.6438,
+      "step": 532
+    },
+    {
+      "epoch": 0.5854730193601538,
+      "grad_norm": 0.6115571856498718,
+      "learning_rate": 0.00026825396825396825,
+      "loss": 0.8757,
+      "step": 533
+    },
+    {
+      "epoch": 0.5865714678017301,
+      "grad_norm": 0.3011985719203949,
+      "learning_rate": 0.0002681318681318681,
+      "loss": 0.6188,
+      "step": 534
+    },
+    {
+      "epoch": 0.5876699162433063,
+      "grad_norm": 0.7029386162757874,
+      "learning_rate": 0.00026800976800976797,
+      "loss": 0.8681,
+      "step": 535
+    },
+    {
+      "epoch": 0.5887683646848826,
+      "grad_norm": 0.4796508550643921,
+      "learning_rate": 0.0002678876678876679,
+      "loss": 0.7207,
+      "step": 536
+    },
+    {
+      "epoch": 0.5898668131264588,
+      "grad_norm": 0.542948842048645,
+      "learning_rate": 0.00026776556776556774,
+      "loss": 0.5587,
+      "step": 537
+    },
+    {
+      "epoch": 0.5909652615680352,
+      "grad_norm": 0.7566731572151184,
+      "learning_rate": 0.0002676434676434676,
+      "loss": 0.8562,
+      "step": 538
+    },
+    {
+      "epoch": 0.5920637100096114,
+      "grad_norm": 0.6411837339401245,
+      "learning_rate": 0.0002675213675213675,
+      "loss": 0.4516,
+      "step": 539
+    },
+    {
+      "epoch": 0.5931621584511877,
+      "grad_norm": 0.41434159874916077,
+      "learning_rate": 0.00026739926739926737,
+      "loss": 0.7069,
+      "step": 540
+    },
+    {
+      "epoch": 0.5942606068927639,
+      "grad_norm": 0.29941752552986145,
+      "learning_rate": 0.0002672771672771672,
+      "loss": 0.7444,
+      "step": 541
+    },
+    {
+      "epoch": 0.5953590553343402,
+      "grad_norm": 1.8168927431106567,
+      "learning_rate": 0.00026715506715506714,
+      "loss": 0.4947,
+      "step": 542
+    },
+    {
+      "epoch": 0.5964575037759166,
+      "grad_norm": 0.5639868974685669,
+      "learning_rate": 0.000267032967032967,
+      "loss": 0.6749,
+      "step": 543
+    },
+    {
+      "epoch": 0.5975559522174928,
+      "grad_norm": 0.5054119229316711,
+      "learning_rate": 0.0002669108669108669,
+      "loss": 0.8075,
+      "step": 544
+    },
+    {
+      "epoch": 0.598654400659069,
+      "grad_norm": 0.3531246483325958,
+      "learning_rate": 0.00026678876678876677,
+      "loss": 0.6986,
+      "step": 545
+    },
+    {
+      "epoch": 0.5997528491006453,
+      "grad_norm": 0.36428287625312805,
+      "learning_rate": 0.0002666666666666666,
+      "loss": 0.6496,
+      "step": 546
+    },
+    {
+      "epoch": 0.6008512975422217,
+      "grad_norm": 0.45706960558891296,
+      "learning_rate": 0.00026654456654456654,
+      "loss": 0.5646,
+      "step": 547
+    },
+    {
+      "epoch": 0.6019497459837979,
+      "grad_norm": 0.39326363801956177,
+      "learning_rate": 0.0002664224664224664,
+      "loss": 0.5037,
+      "step": 548
+    },
+    {
+      "epoch": 0.6030481944253742,
+      "grad_norm": 0.7158151268959045,
+      "learning_rate": 0.00026630036630036625,
+      "loss": 0.5643,
+      "step": 549
+    },
+    {
+      "epoch": 0.6041466428669504,
+      "grad_norm": 0.398335337638855,
+      "learning_rate": 0.00026617826617826617,
+      "loss": 0.5462,
+      "step": 550
+    },
+    {
+      "epoch": 0.6052450913085267,
+      "grad_norm": 0.8625812530517578,
+      "learning_rate": 0.000266056166056166,
+      "loss": 0.7898,
+      "step": 551
+    },
+    {
+      "epoch": 0.606343539750103,
+      "grad_norm": 0.5558099150657654,
+      "learning_rate": 0.0002659340659340659,
+      "loss": 0.7968,
+      "step": 552
+    },
+    {
+      "epoch": 0.6074419881916793,
+      "grad_norm": 0.6244741678237915,
+      "learning_rate": 0.0002658119658119658,
+      "loss": 0.9085,
+      "step": 553
+    },
+    {
+      "epoch": 0.6085404366332555,
+      "grad_norm": 0.4907127916812897,
+      "learning_rate": 0.00026568986568986565,
+      "loss": 0.5683,
+      "step": 554
+    },
+    {
+      "epoch": 0.6096388850748318,
+      "grad_norm": 0.6140159964561462,
+      "learning_rate": 0.0002655677655677655,
+      "loss": 0.5693,
+      "step": 555
+    },
+    {
+      "epoch": 0.610737333516408,
+      "grad_norm": 0.41251274943351746,
+      "learning_rate": 0.0002654456654456654,
+      "loss": 0.728,
+      "step": 556
+    },
+    {
+      "epoch": 0.6118357819579844,
+      "grad_norm": 0.43427684903144836,
+      "learning_rate": 0.00026532356532356534,
+      "loss": 0.5692,
+      "step": 557
+    },
+    {
+      "epoch": 0.6129342303995606,
+      "grad_norm": 0.41471078991889954,
+      "learning_rate": 0.00026520146520146514,
+      "loss": 0.6616,
+      "step": 558
+    },
+    {
+      "epoch": 0.6140326788411369,
+      "grad_norm": 0.4406953752040863,
+      "learning_rate": 0.00026507936507936506,
+      "loss": 0.4764,
+      "step": 559
+    },
+    {
+      "epoch": 0.6151311272827131,
+      "grad_norm": 7.233060359954834,
+      "learning_rate": 0.00026495726495726497,
+      "loss": 0.6111,
+      "step": 560
+    },
+    {
+      "epoch": 0.6162295757242895,
+      "grad_norm": 0.47008857131004333,
+      "learning_rate": 0.0002648351648351648,
+      "loss": 0.8145,
+      "step": 561
+    },
+    {
+      "epoch": 0.6173280241658657,
+      "grad_norm": 0.47636717557907104,
+      "learning_rate": 0.0002647130647130647,
+      "loss": 0.8036,
+      "step": 562
+    },
+    {
+      "epoch": 0.618426472607442,
+      "grad_norm": 0.526971161365509,
+      "learning_rate": 0.0002645909645909646,
+      "loss": 0.7559,
+      "step": 563
+    },
+    {
+      "epoch": 0.6195249210490182,
+      "grad_norm": 0.5027382373809814,
+      "learning_rate": 0.00026446886446886446,
+      "loss": 0.7765,
+      "step": 564
+    },
+    {
+      "epoch": 0.6206233694905945,
+      "grad_norm": 0.4222506284713745,
+      "learning_rate": 0.0002643467643467643,
+      "loss": 0.6376,
+      "step": 565
+    },
+    {
+      "epoch": 0.6217218179321709,
+      "grad_norm": 0.6390372514724731,
+      "learning_rate": 0.0002642246642246642,
+      "loss": 0.8224,
+      "step": 566
+    },
+    {
+      "epoch": 0.6228202663737471,
+      "grad_norm": 0.44495514035224915,
+      "learning_rate": 0.0002641025641025641,
+      "loss": 0.5995,
+      "step": 567
+    },
+    {
+      "epoch": 0.6239187148153233,
+      "grad_norm": 0.7005137205123901,
+      "learning_rate": 0.00026398046398046394,
+      "loss": 0.4986,
+      "step": 568
+    },
+    {
+      "epoch": 0.6250171632568996,
+      "grad_norm": 0.40745365619659424,
+      "learning_rate": 0.0002638583638583638,
+      "loss": 0.608,
+      "step": 569
+    },
+    {
+      "epoch": 0.6261156116984758,
+      "grad_norm": 0.3449142277240753,
+      "learning_rate": 0.0002637362637362637,
+      "loss": 0.6253,
+      "step": 570
+    },
+    {
+      "epoch": 0.6272140601400522,
+      "grad_norm": 0.4318457841873169,
+      "learning_rate": 0.00026361416361416357,
+      "loss": 0.6376,
+      "step": 571
+    },
+    {
+      "epoch": 0.6283125085816285,
+      "grad_norm": 2.2202258110046387,
+      "learning_rate": 0.00026349206349206343,
+      "loss": 0.5477,
+      "step": 572
+    },
+    {
+      "epoch": 0.6294109570232047,
+      "grad_norm": 0.6759721040725708,
+      "learning_rate": 0.00026336996336996334,
+      "loss": 1.1176,
+      "step": 573
+    },
+    {
+      "epoch": 0.630509405464781,
+      "grad_norm": 1.7796927690505981,
+      "learning_rate": 0.00026324786324786326,
+      "loss": 0.8713,
+      "step": 574
+    },
+    {
+      "epoch": 0.6316078539063573,
+      "grad_norm": 0.32952558994293213,
+      "learning_rate": 0.0002631257631257631,
+      "loss": 0.4711,
+      "step": 575
+    },
+    {
+      "epoch": 0.6327063023479336,
+      "grad_norm": 0.40390628576278687,
+      "learning_rate": 0.000263003663003663,
+      "loss": 0.5412,
+      "step": 576
+    },
+    {
+      "epoch": 0.6338047507895098,
+      "grad_norm": 0.7439208030700684,
+      "learning_rate": 0.0002628815628815629,
+      "loss": 0.7094,
+      "step": 577
+    },
+    {
+      "epoch": 0.6349031992310861,
+      "grad_norm": 0.34505775570869446,
+      "learning_rate": 0.00026275946275946274,
+      "loss": 0.5939,
+      "step": 578
+    },
+    {
+      "epoch": 0.6360016476726623,
+      "grad_norm": 0.9452011585235596,
+      "learning_rate": 0.0002626373626373626,
+      "loss": 0.5108,
+      "step": 579
+    },
+    {
+      "epoch": 0.6371000961142387,
+      "grad_norm": 0.42789551615715027,
+      "learning_rate": 0.0002625152625152625,
+      "loss": 0.5661,
+      "step": 580
+    },
+    {
+      "epoch": 0.6381985445558149,
+      "grad_norm": 0.3460575044155121,
+      "learning_rate": 0.0002623931623931624,
+      "loss": 0.8333,
+      "step": 581
+    },
+    {
+      "epoch": 0.6392969929973912,
+      "grad_norm": 0.8932168483734131,
+      "learning_rate": 0.00026227106227106223,
+      "loss": 0.7058,
+      "step": 582
+    },
+    {
+      "epoch": 0.6403954414389674,
+      "grad_norm": 0.8588842749595642,
+      "learning_rate": 0.00026214896214896214,
+      "loss": 0.6905,
+      "step": 583
+    },
+    {
+      "epoch": 0.6414938898805437,
+      "grad_norm": 0.5097251534461975,
+      "learning_rate": 0.000262026862026862,
+      "loss": 0.8189,
+      "step": 584
+    },
+    {
+      "epoch": 0.64259233832212,
+      "grad_norm": 0.45746755599975586,
+      "learning_rate": 0.00026190476190476186,
+      "loss": 0.7212,
+      "step": 585
+    },
+    {
+      "epoch": 0.6436907867636963,
+      "grad_norm": 0.9576689600944519,
+      "learning_rate": 0.0002617826617826618,
+      "loss": 0.6159,
+      "step": 586
+    },
+    {
+      "epoch": 0.6447892352052725,
+      "grad_norm": 0.5721899271011353,
+      "learning_rate": 0.00026166056166056163,
+      "loss": 0.6083,
+      "step": 587
+    },
+    {
+      "epoch": 0.6458876836468488,
+      "grad_norm": 0.4851115942001343,
+      "learning_rate": 0.00026153846153846154,
+      "loss": 0.7678,
+      "step": 588
+    },
+    {
+      "epoch": 0.6469861320884251,
+      "grad_norm": 0.6631761193275452,
+      "learning_rate": 0.0002614163614163614,
+      "loss": 0.7068,
+      "step": 589
+    },
+    {
+      "epoch": 0.6480845805300014,
+      "grad_norm": 0.6862382292747498,
+      "learning_rate": 0.00026129426129426126,
+      "loss": 0.5766,
+      "step": 590
+    },
+    {
+      "epoch": 0.6491830289715776,
+      "grad_norm": 0.3754968047142029,
+      "learning_rate": 0.0002611721611721612,
+      "loss": 0.7254,
+      "step": 591
+    },
+    {
+      "epoch": 0.6502814774131539,
+      "grad_norm": 0.5239700078964233,
+      "learning_rate": 0.00026105006105006103,
+      "loss": 0.5777,
+      "step": 592
+    },
+    {
+      "epoch": 0.6513799258547301,
+      "grad_norm": 0.5103443264961243,
+      "learning_rate": 0.0002609279609279609,
+      "loss": 1.0006,
+      "step": 593
+    },
+    {
+      "epoch": 0.6524783742963065,
+      "grad_norm": 0.4733884632587433,
+      "learning_rate": 0.0002608058608058608,
+      "loss": 0.6851,
+      "step": 594
+    },
+    {
+      "epoch": 0.6535768227378828,
+      "grad_norm": 0.5982065796852112,
+      "learning_rate": 0.00026068376068376066,
+      "loss": 0.6295,
+      "step": 595
+    },
+    {
+      "epoch": 0.654675271179459,
+      "grad_norm": 1.2408190965652466,
+      "learning_rate": 0.0002605616605616605,
+      "loss": 0.8806,
+      "step": 596
+    },
+    {
+      "epoch": 0.6557737196210353,
+      "grad_norm": 0.6005455851554871,
+      "learning_rate": 0.00026043956043956043,
+      "loss": 0.7186,
+      "step": 597
+    },
+    {
+      "epoch": 0.6568721680626116,
+      "grad_norm": 0.33777105808258057,
+      "learning_rate": 0.0002603174603174603,
+      "loss": 0.4599,
+      "step": 598
+    },
+    {
+      "epoch": 0.6579706165041879,
+      "grad_norm": 0.5336529612541199,
+      "learning_rate": 0.00026019536019536015,
+      "loss": 0.553,
+      "step": 599
+    },
+    {
+      "epoch": 0.6590690649457641,
+      "grad_norm": 0.6930931806564331,
+      "learning_rate": 0.00026007326007326006,
+      "loss": 0.5686,
+      "step": 600
+    },
+    {
+      "epoch": 0.6601675133873404,
+      "grad_norm": 1.1340439319610596,
+      "learning_rate": 0.0002599511599511599,
+      "loss": 0.5886,
+      "step": 601
+    },
+    {
+      "epoch": 0.6612659618289166,
+      "grad_norm": 0.9833797812461853,
+      "learning_rate": 0.0002598290598290598,
+      "loss": 0.7109,
+      "step": 602
+    },
+    {
+      "epoch": 0.662364410270493,
+      "grad_norm": 0.9305315017700195,
+      "learning_rate": 0.0002597069597069597,
+      "loss": 0.8341,
+      "step": 603
+    },
+    {
+      "epoch": 0.6634628587120692,
+      "grad_norm": 0.9753265380859375,
+      "learning_rate": 0.00025958485958485955,
+      "loss": 0.7102,
+      "step": 604
+    },
+    {
+      "epoch": 0.6645613071536455,
+      "grad_norm": 2.2342822551727295,
+      "learning_rate": 0.00025946275946275946,
+      "loss": 0.6784,
+      "step": 605
+    },
+    {
+      "epoch": 0.6656597555952217,
+      "grad_norm": 0.6815157532691956,
+      "learning_rate": 0.0002593406593406593,
+      "loss": 0.7689,
+      "step": 606
+    },
+    {
+      "epoch": 0.666758204036798,
+      "grad_norm": 0.7792591452598572,
+      "learning_rate": 0.0002592185592185592,
+      "loss": 0.9444,
+      "step": 607
+    },
+    {
+      "epoch": 0.6678566524783743,
+      "grad_norm": 0.668251097202301,
+      "learning_rate": 0.0002590964590964591,
+      "loss": 0.6899,
+      "step": 608
+    },
+    {
+      "epoch": 0.6689551009199506,
+      "grad_norm": 0.5041349530220032,
+      "learning_rate": 0.00025897435897435895,
+      "loss": 0.652,
+      "step": 609
+    },
+    {
+      "epoch": 0.6700535493615268,
+      "grad_norm": 0.35069939494132996,
+      "learning_rate": 0.0002588522588522588,
+      "loss": 0.8102,
+      "step": 610
+    },
+    {
+      "epoch": 0.6711519978031031,
+      "grad_norm": 3.324793577194214,
+      "learning_rate": 0.0002587301587301587,
+      "loss": 0.7936,
+      "step": 611
+    },
+    {
+      "epoch": 0.6722504462446794,
+      "grad_norm": 0.6778903007507324,
+      "learning_rate": 0.0002586080586080586,
+      "loss": 0.6258,
+      "step": 612
+    },
+    {
+      "epoch": 0.6733488946862557,
+      "grad_norm": 3.034745454788208,
+      "learning_rate": 0.00025848595848595844,
+      "loss": 0.697,
+      "step": 613
+    },
+    {
+      "epoch": 0.6744473431278319,
+      "grad_norm": 2.563870429992676,
+      "learning_rate": 0.00025836385836385835,
+      "loss": 0.7596,
+      "step": 614
+    },
+    {
+      "epoch": 0.6755457915694082,
+      "grad_norm": 0.45592913031578064,
+      "learning_rate": 0.0002582417582417582,
+      "loss": 0.7753,
+      "step": 615
+    },
+    {
+      "epoch": 0.6766442400109844,
+      "grad_norm": 0.7209720015525818,
+      "learning_rate": 0.00025811965811965807,
+      "loss": 0.6907,
+      "step": 616
+    },
+    {
+      "epoch": 0.6777426884525608,
+      "grad_norm": 0.4611949026584625,
+      "learning_rate": 0.000257997557997558,
+      "loss": 0.5896,
+      "step": 617
+    },
+    {
+      "epoch": 0.678841136894137,
+      "grad_norm": 1.3885395526885986,
+      "learning_rate": 0.0002578754578754579,
+      "loss": 0.6344,
+      "step": 618
+    },
+    {
+      "epoch": 0.6799395853357133,
+      "grad_norm": 0.544572651386261,
+      "learning_rate": 0.00025775335775335775,
+      "loss": 0.586,
+      "step": 619
+    },
+    {
+      "epoch": 0.6810380337772896,
+      "grad_norm": 0.5637034177780151,
+      "learning_rate": 0.0002576312576312576,
+      "loss": 0.8284,
+      "step": 620
+    },
+    {
+      "epoch": 0.6821364822188658,
+      "grad_norm": 1.170779824256897,
+      "learning_rate": 0.00025750915750915747,
+      "loss": 0.8818,
+      "step": 621
+    },
+    {
+      "epoch": 0.6832349306604422,
+      "grad_norm": 0.4877263605594635,
+      "learning_rate": 0.0002573870573870574,
+      "loss": 0.9179,
+      "step": 622
+    },
+    {
+      "epoch": 0.6843333791020184,
+      "grad_norm": 0.6684415340423584,
+      "learning_rate": 0.00025726495726495724,
+      "loss": 0.7358,
+      "step": 623
+    },
+    {
+      "epoch": 0.6854318275435947,
+      "grad_norm": 0.6679075956344604,
+      "learning_rate": 0.0002571428571428571,
+      "loss": 0.6342,
+      "step": 624
+    },
+    {
+      "epoch": 0.6865302759851709,
+      "grad_norm": 0.65242600440979,
+      "learning_rate": 0.000257020757020757,
+      "loss": 0.4762,
+      "step": 625
+    },
+    {
+      "epoch": 0.6876287244267473,
+      "grad_norm": 0.806523859500885,
+      "learning_rate": 0.00025689865689865687,
+      "loss": 0.7621,
+      "step": 626
+    },
+    {
+      "epoch": 0.6887271728683235,
+      "grad_norm": 1.09652578830719,
+      "learning_rate": 0.0002567765567765567,
+      "loss": 0.6594,
+      "step": 627
+    },
+    {
+      "epoch": 0.6898256213098998,
+      "grad_norm": 0.412505179643631,
+      "learning_rate": 0.00025665445665445664,
+      "loss": 0.8026,
+      "step": 628
+    },
+    {
+      "epoch": 0.690924069751476,
+      "grad_norm": 0.5801676511764526,
+      "learning_rate": 0.0002565323565323565,
+      "loss": 0.7026,
+      "step": 629
+    },
+    {
+      "epoch": 0.6920225181930523,
+      "grad_norm": 0.6822883486747742,
+      "learning_rate": 0.00025641025641025636,
+      "loss": 0.4372,
+      "step": 630
+    },
+    {
+      "epoch": 0.6931209666346286,
+      "grad_norm": 0.3455508351325989,
+      "learning_rate": 0.00025628815628815627,
+      "loss": 0.5624,
+      "step": 631
+    },
+    {
+      "epoch": 0.6942194150762049,
+      "grad_norm": 0.3533216714859009,
+      "learning_rate": 0.0002561660561660562,
+      "loss": 0.7493,
+      "step": 632
+    },
+    {
+      "epoch": 0.6953178635177811,
+      "grad_norm": 1.4306656122207642,
+      "learning_rate": 0.000256043956043956,
+      "loss": 0.7537,
+      "step": 633
+    },
+    {
+      "epoch": 0.6964163119593574,
+      "grad_norm": 0.336393266916275,
+      "learning_rate": 0.0002559218559218559,
+      "loss": 0.787,
+      "step": 634
+    },
+    {
+      "epoch": 0.6975147604009336,
+      "grad_norm": 0.5303547382354736,
+      "learning_rate": 0.0002557997557997558,
+      "loss": 0.5604,
+      "step": 635
+    },
+    {
+      "epoch": 0.69861320884251,
+      "grad_norm": 0.5421821475028992,
+      "learning_rate": 0.00025567765567765567,
+      "loss": 0.6905,
+      "step": 636
+    },
+    {
+      "epoch": 0.6997116572840862,
+      "grad_norm": 0.5445061922073364,
+      "learning_rate": 0.00025555555555555553,
+      "loss": 0.6389,
+      "step": 637
+    },
+    {
+      "epoch": 0.7008101057256625,
+      "grad_norm": 0.42832881212234497,
+      "learning_rate": 0.00025543345543345544,
+      "loss": 0.7825,
+      "step": 638
+    },
+    {
+      "epoch": 0.7019085541672387,
+      "grad_norm": 1.4624862670898438,
+      "learning_rate": 0.0002553113553113553,
+      "loss": 0.4964,
+      "step": 639
+    },
+    {
+      "epoch": 0.7030070026088151,
+      "grad_norm": 0.38657426834106445,
+      "learning_rate": 0.00025518925518925516,
+      "loss": 0.5299,
+      "step": 640
+    },
+    {
+      "epoch": 0.7041054510503914,
+      "grad_norm": 14.422834396362305,
+      "learning_rate": 0.00025506715506715507,
+      "loss": 0.5008,
+      "step": 641
+    },
+    {
+      "epoch": 0.7052038994919676,
+      "grad_norm": 0.591106653213501,
+      "learning_rate": 0.00025494505494505493,
+      "loss": 0.6732,
+      "step": 642
+    },
+    {
+      "epoch": 0.7063023479335439,
+      "grad_norm": 1.6697375774383545,
+      "learning_rate": 0.0002548229548229548,
+      "loss": 0.6782,
+      "step": 643
+    },
+    {
+      "epoch": 0.7074007963751201,
+      "grad_norm": 1.670777678489685,
+      "learning_rate": 0.0002547008547008547,
+      "loss": 0.5275,
+      "step": 644
+    },
+    {
+      "epoch": 0.7084992448166965,
+      "grad_norm": 2.3361563682556152,
+      "learning_rate": 0.00025457875457875456,
+      "loss": 0.4177,
+      "step": 645
+    },
+    {
+      "epoch": 0.7095976932582727,
+      "grad_norm": 1.823844313621521,
+      "learning_rate": 0.0002544566544566544,
+      "loss": 0.5438,
+      "step": 646
+    },
+    {
+      "epoch": 0.710696141699849,
+      "grad_norm": 0.5374146699905396,
+      "learning_rate": 0.0002543345543345543,
+      "loss": 0.6704,
+      "step": 647
+    },
+    {
+      "epoch": 0.7117945901414252,
+      "grad_norm": 0.9709361791610718,
+      "learning_rate": 0.0002542124542124542,
+      "loss": 0.8896,
+      "step": 648
+    },
+    {
+      "epoch": 0.7128930385830015,
+      "grad_norm": 0.7118197083473206,
+      "learning_rate": 0.0002540903540903541,
+      "loss": 0.766,
+      "step": 649
+    },
+    {
+      "epoch": 0.7139914870245778,
+      "grad_norm": 0.4597225487232208,
+      "learning_rate": 0.00025396825396825396,
+      "loss": 0.7498,
+      "step": 650
+    },
+    {
+      "epoch": 0.7150899354661541,
+      "grad_norm": 0.9708977937698364,
+      "learning_rate": 0.0002538461538461538,
+      "loss": 0.7602,
+      "step": 651
+    },
+    {
+      "epoch": 0.7161883839077303,
+      "grad_norm": 0.8156960606575012,
+      "learning_rate": 0.00025372405372405373,
+      "loss": 1.1105,
+      "step": 652
+    },
+    {
+      "epoch": 0.7172868323493066,
+      "grad_norm": 1.4135644435882568,
+      "learning_rate": 0.0002536019536019536,
+      "loss": 0.9203,
+      "step": 653
+    },
+    {
+      "epoch": 0.7183852807908829,
+      "grad_norm": 0.5754226446151733,
+      "learning_rate": 0.00025347985347985344,
+      "loss": 0.5368,
+      "step": 654
+    },
+    {
+      "epoch": 0.7194837292324592,
+      "grad_norm": 1.7644588947296143,
+      "learning_rate": 0.00025335775335775336,
+      "loss": 0.6451,
+      "step": 655
+    },
+    {
+      "epoch": 0.7205821776740354,
+      "grad_norm": 4.35576868057251,
+      "learning_rate": 0.0002532356532356532,
+      "loss": 0.6732,
+      "step": 656
+    },
+    {
+      "epoch": 0.7216806261156117,
+      "grad_norm": 1.1072558164596558,
+      "learning_rate": 0.0002531135531135531,
+      "loss": 0.7901,
+      "step": 657
+    },
+    {
+      "epoch": 0.7227790745571879,
+      "grad_norm": 0.3916113078594208,
+      "learning_rate": 0.000252991452991453,
+      "loss": 0.7153,
+      "step": 658
+    },
+    {
+      "epoch": 0.7238775229987643,
+      "grad_norm": 1.055137276649475,
+      "learning_rate": 0.00025286935286935285,
+      "loss": 0.8664,
+      "step": 659
+    },
+    {
+      "epoch": 0.7249759714403405,
+      "grad_norm": 0.5966087579727173,
+      "learning_rate": 0.0002527472527472527,
+      "loss": 0.933,
+      "step": 660
+    },
+    {
+      "epoch": 0.7260744198819168,
+      "grad_norm": 0.40958529710769653,
+      "learning_rate": 0.0002526251526251526,
+      "loss": 0.7196,
+      "step": 661
+    },
+    {
+      "epoch": 0.727172868323493,
+      "grad_norm": 0.4636710584163666,
+      "learning_rate": 0.0002525030525030525,
+      "loss": 0.7039,
+      "step": 662
+    },
+    {
+      "epoch": 0.7282713167650693,
+      "grad_norm": 0.6967337131500244,
+      "learning_rate": 0.0002523809523809524,
+      "loss": 0.8981,
+      "step": 663
+    },
+    {
+      "epoch": 0.7293697652066456,
+      "grad_norm": 0.49781784415245056,
+      "learning_rate": 0.00025225885225885225,
+      "loss": 0.7239,
+      "step": 664
+    },
+    {
+      "epoch": 0.7304682136482219,
+      "grad_norm": 0.940851628780365,
+      "learning_rate": 0.0002521367521367521,
+      "loss": 0.8199,
+      "step": 665
+    },
+    {
+      "epoch": 0.7315666620897981,
+      "grad_norm": 1.0271226167678833,
+      "learning_rate": 0.000252014652014652,
+      "loss": 0.6757,
+      "step": 666
+    },
+    {
+      "epoch": 0.7326651105313744,
+      "grad_norm": 0.5299912095069885,
+      "learning_rate": 0.0002518925518925519,
+      "loss": 0.8464,
+      "step": 667
+    },
+    {
+      "epoch": 0.7337635589729508,
+      "grad_norm": 0.7060052156448364,
+      "learning_rate": 0.00025177045177045173,
+      "loss": 0.6541,
+      "step": 668
+    },
+    {
+      "epoch": 0.734862007414527,
+      "grad_norm": 0.5419691205024719,
+      "learning_rate": 0.00025164835164835165,
+      "loss": 0.8741,
+      "step": 669
+    },
+    {
+      "epoch": 0.7359604558561033,
+      "grad_norm": 0.6363463401794434,
+      "learning_rate": 0.0002515262515262515,
+      "loss": 0.7224,
+      "step": 670
+    },
+    {
+      "epoch": 0.7370589042976795,
+      "grad_norm": 0.7622922658920288,
+      "learning_rate": 0.00025140415140415136,
+      "loss": 0.9402,
+      "step": 671
+    },
+    {
+      "epoch": 0.7381573527392558,
+      "grad_norm": 0.7477490305900574,
+      "learning_rate": 0.0002512820512820513,
+      "loss": 0.6036,
+      "step": 672
+    },
+    {
+      "epoch": 0.7392558011808321,
+      "grad_norm": 0.4813562333583832,
+      "learning_rate": 0.00025115995115995113,
+      "loss": 0.5982,
+      "step": 673
+    },
+    {
+      "epoch": 0.7403542496224084,
+      "grad_norm": 3.112766981124878,
+      "learning_rate": 0.000251037851037851,
+      "loss": 0.5825,
+      "step": 674
+    },
+    {
+      "epoch": 0.7414526980639846,
+      "grad_norm": 0.9523088932037354,
+      "learning_rate": 0.0002509157509157509,
+      "loss": 0.5698,
+      "step": 675
+    },
+    {
+      "epoch": 0.7425511465055609,
+      "grad_norm": 0.3426001965999603,
+      "learning_rate": 0.00025079365079365076,
+      "loss": 0.5516,
+      "step": 676
+    },
+    {
+      "epoch": 0.7436495949471371,
+      "grad_norm": 0.4866350591182709,
+      "learning_rate": 0.0002506715506715506,
+      "loss": 0.5466,
+      "step": 677
+    },
+    {
+      "epoch": 0.7447480433887135,
+      "grad_norm": 0.6590595245361328,
+      "learning_rate": 0.00025054945054945053,
+      "loss": 0.7579,
+      "step": 678
+    },
+    {
+      "epoch": 0.7458464918302897,
+      "grad_norm": 0.36733704805374146,
+      "learning_rate": 0.0002504273504273504,
+      "loss": 0.5114,
+      "step": 679
+    },
+    {
+      "epoch": 0.746944940271866,
+      "grad_norm": 0.5890951156616211,
+      "learning_rate": 0.0002503052503052503,
+      "loss": 0.7196,
+      "step": 680
+    },
+    {
+      "epoch": 0.7480433887134422,
+      "grad_norm": 0.8393438458442688,
+      "learning_rate": 0.00025018315018315016,
+      "loss": 0.6291,
+      "step": 681
+    },
+    {
+      "epoch": 0.7491418371550186,
+      "grad_norm": 0.9745636582374573,
+      "learning_rate": 0.00025006105006105,
+      "loss": 0.8675,
+      "step": 682
+    },
+    {
+      "epoch": 0.7502402855965948,
+      "grad_norm": 1.1764310598373413,
+      "learning_rate": 0.00024993894993894993,
+      "loss": 0.9384,
+      "step": 683
+    },
+    {
+      "epoch": 0.7513387340381711,
+      "grad_norm": 0.6199970245361328,
+      "learning_rate": 0.0002498168498168498,
+      "loss": 0.5984,
+      "step": 684
+    },
+    {
+      "epoch": 0.7524371824797473,
+      "grad_norm": 2.2708802223205566,
+      "learning_rate": 0.00024969474969474965,
+      "loss": 0.7867,
+      "step": 685
+    },
+    {
+      "epoch": 0.7535356309213236,
+      "grad_norm": 0.6731462478637695,
+      "learning_rate": 0.00024957264957264956,
+      "loss": 0.5377,
+      "step": 686
+    },
+    {
+      "epoch": 0.7546340793629,
+      "grad_norm": 0.991669774055481,
+      "learning_rate": 0.0002494505494505494,
+      "loss": 0.7015,
+      "step": 687
+    },
+    {
+      "epoch": 0.7557325278044762,
+      "grad_norm": 0.5873506665229797,
+      "learning_rate": 0.0002493284493284493,
+      "loss": 0.567,
+      "step": 688
+    },
+    {
+      "epoch": 0.7568309762460524,
+      "grad_norm": 1.5025473833084106,
+      "learning_rate": 0.0002492063492063492,
+      "loss": 0.6264,
+      "step": 689
+    },
+    {
+      "epoch": 0.7579294246876287,
+      "grad_norm": 0.4942665696144104,
+      "learning_rate": 0.00024908424908424905,
+      "loss": 0.7623,
+      "step": 690
+    },
+    {
+      "epoch": 0.7590278731292049,
+      "grad_norm": 0.5522105693817139,
+      "learning_rate": 0.0002489621489621489,
+      "loss": 0.6192,
+      "step": 691
+    },
+    {
+      "epoch": 0.7601263215707813,
+      "grad_norm": 1.25243079662323,
+      "learning_rate": 0.0002488400488400488,
+      "loss": 0.8547,
+      "step": 692
+    },
+    {
+      "epoch": 0.7612247700123576,
+      "grad_norm": 0.5228685140609741,
+      "learning_rate": 0.00024871794871794874,
+      "loss": 0.7365,
+      "step": 693
+    },
+    {
+      "epoch": 0.7623232184539338,
+      "grad_norm": 1.5090827941894531,
+      "learning_rate": 0.0002485958485958486,
+      "loss": 0.9226,
+      "step": 694
+    },
+    {
+      "epoch": 0.76342166689551,
+      "grad_norm": 3.3617379665374756,
+      "learning_rate": 0.00024847374847374845,
+      "loss": 0.7942,
+      "step": 695
+    },
+    {
+      "epoch": 0.7645201153370864,
+      "grad_norm": 0.5350137948989868,
+      "learning_rate": 0.0002483516483516483,
+      "loss": 0.6254,
+      "step": 696
+    },
+    {
+      "epoch": 0.7656185637786627,
+      "grad_norm": 0.8871312141418457,
+      "learning_rate": 0.0002482295482295482,
+      "loss": 0.8241,
+      "step": 697
+    },
+    {
+      "epoch": 0.7667170122202389,
+      "grad_norm": 0.48593926429748535,
+      "learning_rate": 0.0002481074481074481,
+      "loss": 0.5707,
+      "step": 698
+    },
+    {
+      "epoch": 0.7678154606618152,
+      "grad_norm": 0.7460000514984131,
+      "learning_rate": 0.00024798534798534794,
+      "loss": 0.9521,
+      "step": 699
+    },
+    {
+      "epoch": 0.7689139091033914,
+      "grad_norm": 0.7105034589767456,
+      "learning_rate": 0.00024786324786324785,
+      "loss": 0.7513,
+      "step": 700
+    },
+    {
+      "epoch": 0.7700123575449678,
+      "grad_norm": 0.40251481533050537,
+      "learning_rate": 0.0002477411477411477,
+      "loss": 0.6067,
+      "step": 701
+    },
+    {
+      "epoch": 0.771110805986544,
+      "grad_norm": 0.452709436416626,
+      "learning_rate": 0.00024761904761904757,
+      "loss": 0.671,
+      "step": 702
+    },
+    {
+      "epoch": 0.7722092544281203,
+      "grad_norm": 0.581453263759613,
+      "learning_rate": 0.0002474969474969475,
+      "loss": 0.5356,
+      "step": 703
+    },
+    {
+      "epoch": 0.7733077028696965,
+      "grad_norm": 0.8013669848442078,
+      "learning_rate": 0.00024737484737484734,
+      "loss": 0.6889,
+      "step": 704
+    },
+    {
+      "epoch": 0.7744061513112728,
+      "grad_norm": 1.1480565071105957,
+      "learning_rate": 0.0002472527472527472,
+      "loss": 0.7456,
+      "step": 705
+    },
+    {
+      "epoch": 0.7755045997528491,
+      "grad_norm": 0.7568329572677612,
+      "learning_rate": 0.0002471306471306471,
+      "loss": 0.7455,
+      "step": 706
+    },
+    {
+      "epoch": 0.7766030481944254,
+      "grad_norm": 0.4223226308822632,
+      "learning_rate": 0.000247008547008547,
+      "loss": 0.7138,
+      "step": 707
+    },
+    {
+      "epoch": 0.7777014966360016,
+      "grad_norm": 0.372872531414032,
+      "learning_rate": 0.00024688644688644683,
+      "loss": 0.8037,
+      "step": 708
+    },
+    {
+      "epoch": 0.7787999450775779,
+      "grad_norm": 0.968614399433136,
+      "learning_rate": 0.00024676434676434674,
+      "loss": 0.5943,
+      "step": 709
+    },
+    {
+      "epoch": 0.7798983935191542,
+      "grad_norm": 0.801157534122467,
+      "learning_rate": 0.00024664224664224665,
+      "loss": 0.9467,
+      "step": 710
+    },
+    {
+      "epoch": 0.7809968419607305,
+      "grad_norm": 0.7115808129310608,
+      "learning_rate": 0.0002465201465201465,
+      "loss": 0.7828,
+      "step": 711
+    },
+    {
+      "epoch": 0.7820952904023067,
+      "grad_norm": 1.2951349020004272,
+      "learning_rate": 0.00024639804639804637,
+      "loss": 0.6221,
+      "step": 712
+    },
+    {
+      "epoch": 0.783193738843883,
+      "grad_norm": 0.47706693410873413,
+      "learning_rate": 0.0002462759462759463,
+      "loss": 0.3641,
+      "step": 713
+    },
+    {
+      "epoch": 0.7842921872854592,
+      "grad_norm": 0.8871097564697266,
+      "learning_rate": 0.00024615384615384614,
+      "loss": 0.6177,
+      "step": 714
+    },
+    {
+      "epoch": 0.7853906357270356,
+      "grad_norm": 0.7920973896980286,
+      "learning_rate": 0.000246031746031746,
+      "loss": 0.5858,
+      "step": 715
+    },
+    {
+      "epoch": 0.7864890841686119,
+      "grad_norm": 0.49732694029808044,
+      "learning_rate": 0.0002459096459096459,
+      "loss": 0.5176,
+      "step": 716
+    },
+    {
+      "epoch": 0.7875875326101881,
+      "grad_norm": 0.34965720772743225,
+      "learning_rate": 0.00024578754578754577,
+      "loss": 0.4983,
+      "step": 717
+    },
+    {
+      "epoch": 0.7886859810517644,
+      "grad_norm": 0.45963025093078613,
+      "learning_rate": 0.00024566544566544563,
+      "loss": 0.7756,
+      "step": 718
+    },
+    {
+      "epoch": 0.7897844294933407,
+      "grad_norm": 0.5802373290061951,
+      "learning_rate": 0.00024554334554334554,
+      "loss": 0.5773,
+      "step": 719
+    },
+    {
+      "epoch": 0.790882877934917,
+      "grad_norm": 1.8482742309570312,
+      "learning_rate": 0.0002454212454212454,
+      "loss": 0.7978,
+      "step": 720
+    },
+    {
+      "epoch": 0.7919813263764932,
+      "grad_norm": 0.5821959972381592,
+      "learning_rate": 0.00024529914529914526,
+      "loss": 0.7483,
+      "step": 721
+    },
+    {
+      "epoch": 0.7930797748180695,
+      "grad_norm": 0.9352701306343079,
+      "learning_rate": 0.0002451770451770451,
+      "loss": 0.6979,
+      "step": 722
+    },
+    {
+      "epoch": 0.7941782232596457,
+      "grad_norm": 0.554032564163208,
+      "learning_rate": 0.00024505494505494503,
+      "loss": 0.6773,
+      "step": 723
+    },
+    {
+      "epoch": 0.7952766717012221,
+      "grad_norm": 0.6914504766464233,
+      "learning_rate": 0.00024493284493284494,
+      "loss": 0.6548,
+      "step": 724
+    },
+    {
+      "epoch": 0.7963751201427983,
+      "grad_norm": 0.40804949402809143,
+      "learning_rate": 0.0002448107448107448,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.7974735685843746,
+      "grad_norm": 0.4965716302394867,
+      "learning_rate": 0.00024468864468864466,
+      "loss": 0.4879,
+      "step": 726
+    },
+    {
+      "epoch": 0.7985720170259508,
+      "grad_norm": 0.48798999190330505,
+      "learning_rate": 0.00024456654456654457,
+      "loss": 0.7003,
+      "step": 727
+    },
+    {
+      "epoch": 0.7996704654675271,
+      "grad_norm": 0.6946013569831848,
+      "learning_rate": 0.00024444444444444443,
+      "loss": 0.7508,
+      "step": 728
+    },
+    {
+      "epoch": 0.8007689139091034,
+      "grad_norm": 0.4310678243637085,
+      "learning_rate": 0.0002443223443223443,
+      "loss": 0.5765,
+      "step": 729
+    },
+    {
+      "epoch": 0.8018673623506797,
+      "grad_norm": 0.5407636761665344,
+      "learning_rate": 0.0002442002442002442,
+      "loss": 0.5445,
+      "step": 730
+    },
+    {
+      "epoch": 0.8029658107922559,
+      "grad_norm": 0.6281490921974182,
+      "learning_rate": 0.00024407814407814403,
+      "loss": 0.9319,
+      "step": 731
+    },
+    {
+      "epoch": 0.8040642592338322,
+      "grad_norm": 1.2027008533477783,
+      "learning_rate": 0.00024395604395604394,
+      "loss": 0.3957,
+      "step": 732
+    },
+    {
+      "epoch": 0.8051627076754085,
+      "grad_norm": 0.543230414390564,
+      "learning_rate": 0.00024383394383394383,
+      "loss": 0.7919,
+      "step": 733
+    },
+    {
+      "epoch": 0.8062611561169848,
+      "grad_norm": 0.4269828498363495,
+      "learning_rate": 0.0002437118437118437,
+      "loss": 0.6081,
+      "step": 734
+    },
+    {
+      "epoch": 0.807359604558561,
+      "grad_norm": 1.2857966423034668,
+      "learning_rate": 0.00024358974358974357,
+      "loss": 0.8654,
+      "step": 735
+    },
+    {
+      "epoch": 0.8084580530001373,
+      "grad_norm": 0.6370485424995422,
+      "learning_rate": 0.00024346764346764346,
+      "loss": 0.8053,
+      "step": 736
+    },
+    {
+      "epoch": 0.8095565014417135,
+      "grad_norm": 1.1288559436798096,
+      "learning_rate": 0.00024334554334554332,
+      "loss": 0.8709,
+      "step": 737
+    },
+    {
+      "epoch": 0.8106549498832899,
+      "grad_norm": 0.5601497292518616,
+      "learning_rate": 0.0002432234432234432,
+      "loss": 0.7982,
+      "step": 738
+    },
+    {
+      "epoch": 0.8117533983248661,
+      "grad_norm": 0.476745069026947,
+      "learning_rate": 0.0002431013431013431,
+      "loss": 0.7372,
+      "step": 739
+    },
+    {
+      "epoch": 0.8128518467664424,
+      "grad_norm": 0.4287762939929962,
+      "learning_rate": 0.00024297924297924295,
+      "loss": 0.5686,
+      "step": 740
+    },
+    {
+      "epoch": 0.8139502952080186,
+      "grad_norm": 0.7039306163787842,
+      "learning_rate": 0.00024285714285714283,
+      "loss": 0.7976,
+      "step": 741
+    },
+    {
+      "epoch": 0.8150487436495949,
+      "grad_norm": 0.47433528304100037,
+      "learning_rate": 0.00024273504273504272,
+      "loss": 0.6375,
+      "step": 742
+    },
+    {
+      "epoch": 0.8161471920911713,
+      "grad_norm": 0.5443944931030273,
+      "learning_rate": 0.00024261294261294258,
+      "loss": 0.6793,
+      "step": 743
+    },
+    {
+      "epoch": 0.8172456405327475,
+      "grad_norm": 0.516094982624054,
+      "learning_rate": 0.00024249084249084246,
+      "loss": 0.785,
+      "step": 744
+    },
+    {
+      "epoch": 0.8183440889743238,
+      "grad_norm": 0.6694304347038269,
+      "learning_rate": 0.00024236874236874237,
+      "loss": 0.5431,
+      "step": 745
+    },
+    {
+      "epoch": 0.8194425374159,
+      "grad_norm": 0.5309669375419617,
+      "learning_rate": 0.00024224664224664223,
+      "loss": 0.5806,
+      "step": 746
+    },
+    {
+      "epoch": 0.8205409858574764,
+      "grad_norm": 0.5502971410751343,
+      "learning_rate": 0.00024212454212454212,
+      "loss": 0.5053,
+      "step": 747
+    },
+    {
+      "epoch": 0.8216394342990526,
+      "grad_norm": 0.5242869853973389,
+      "learning_rate": 0.00024200244200244198,
+      "loss": 0.8189,
+      "step": 748
+    },
+    {
+      "epoch": 0.8227378827406289,
+      "grad_norm": 0.4131311774253845,
+      "learning_rate": 0.00024188034188034186,
+      "loss": 0.7074,
+      "step": 749
+    },
+    {
+      "epoch": 0.8238363311822051,
+      "grad_norm": 0.599915087223053,
+      "learning_rate": 0.00024175824175824175,
+      "loss": 0.9408,
+      "step": 750
+    },
+    {
+      "epoch": 0.8249347796237814,
+      "grad_norm": 0.3683515191078186,
+      "learning_rate": 0.0002416361416361416,
+      "loss": 0.6675,
+      "step": 751
+    },
+    {
+      "epoch": 0.8260332280653577,
+      "grad_norm": 1.633415699005127,
+      "learning_rate": 0.0002415140415140415,
+      "loss": 0.6768,
+      "step": 752
+    },
+    {
+      "epoch": 0.827131676506934,
+      "grad_norm": 0.3848377764225006,
+      "learning_rate": 0.00024139194139194138,
+      "loss": 0.485,
+      "step": 753
+    },
+    {
+      "epoch": 0.8282301249485102,
+      "grad_norm": 0.4116027355194092,
+      "learning_rate": 0.00024126984126984123,
+      "loss": 0.8253,
+      "step": 754
+    },
+    {
+      "epoch": 0.8293285733900865,
+      "grad_norm": 0.5805407762527466,
+      "learning_rate": 0.00024114774114774112,
+      "loss": 0.825,
+      "step": 755
+    },
+    {
+      "epoch": 0.8304270218316627,
+      "grad_norm": 1.2401742935180664,
+      "learning_rate": 0.000241025641025641,
+      "loss": 0.6394,
+      "step": 756
+    },
+    {
+      "epoch": 0.8315254702732391,
+      "grad_norm": 0.42345038056373596,
+      "learning_rate": 0.00024090354090354086,
+      "loss": 0.6958,
+      "step": 757
+    },
+    {
+      "epoch": 0.8326239187148153,
+      "grad_norm": 1.3758116960525513,
+      "learning_rate": 0.00024078144078144075,
+      "loss": 0.6997,
+      "step": 758
+    },
+    {
+      "epoch": 0.8337223671563916,
+      "grad_norm": 1.1826672554016113,
+      "learning_rate": 0.00024065934065934066,
+      "loss": 0.7908,
+      "step": 759
+    },
+    {
+      "epoch": 0.8348208155979678,
+      "grad_norm": 1.0752373933792114,
+      "learning_rate": 0.0002405372405372405,
+      "loss": 0.8896,
+      "step": 760
+    },
+    {
+      "epoch": 0.8359192640395442,
+      "grad_norm": 0.3347112834453583,
+      "learning_rate": 0.0002404151404151404,
+      "loss": 0.8202,
+      "step": 761
+    },
+    {
+      "epoch": 0.8370177124811204,
+      "grad_norm": 0.5837082266807556,
+      "learning_rate": 0.0002402930402930403,
+      "loss": 0.7502,
+      "step": 762
+    },
+    {
+      "epoch": 0.8381161609226967,
+      "grad_norm": 0.5439388751983643,
+      "learning_rate": 0.00024017094017094015,
+      "loss": 0.6928,
+      "step": 763
+    },
+    {
+      "epoch": 0.839214609364273,
+      "grad_norm": 0.35348060727119446,
+      "learning_rate": 0.00024004884004884004,
+      "loss": 0.5495,
+      "step": 764
+    },
+    {
+      "epoch": 0.8403130578058492,
+      "grad_norm": 0.4943974018096924,
+      "learning_rate": 0.00023992673992673992,
+      "loss": 0.9218,
+      "step": 765
+    },
+    {
+      "epoch": 0.8414115062474256,
+      "grad_norm": 0.628667414188385,
+      "learning_rate": 0.00023980463980463978,
+      "loss": 0.6266,
+      "step": 766
+    },
+    {
+      "epoch": 0.8425099546890018,
+      "grad_norm": 0.822575032711029,
+      "learning_rate": 0.00023968253968253966,
+      "loss": 0.791,
+      "step": 767
+    },
+    {
+      "epoch": 0.843608403130578,
+      "grad_norm": 0.3044184446334839,
+      "learning_rate": 0.00023956043956043955,
+      "loss": 0.6048,
+      "step": 768
+    },
+    {
+      "epoch": 0.8447068515721543,
+      "grad_norm": 0.40807369351387024,
+      "learning_rate": 0.0002394383394383394,
+      "loss": 0.6286,
+      "step": 769
+    },
+    {
+      "epoch": 0.8458053000137306,
+      "grad_norm": 1.2373838424682617,
+      "learning_rate": 0.0002393162393162393,
+      "loss": 0.5133,
+      "step": 770
+    },
+    {
+      "epoch": 0.8469037484553069,
+      "grad_norm": 0.5104987025260925,
+      "learning_rate": 0.00023919413919413918,
+      "loss": 0.591,
+      "step": 771
+    },
+    {
+      "epoch": 0.8480021968968832,
+      "grad_norm": 0.6644220352172852,
+      "learning_rate": 0.00023907203907203904,
+      "loss": 0.7039,
+      "step": 772
+    },
+    {
+      "epoch": 0.8491006453384594,
+      "grad_norm": 0.5887960195541382,
+      "learning_rate": 0.00023894993894993892,
+      "loss": 0.7017,
+      "step": 773
+    },
+    {
+      "epoch": 0.8501990937800357,
+      "grad_norm": 0.6568577885627747,
+      "learning_rate": 0.00023882783882783878,
+      "loss": 0.6131,
+      "step": 774
+    },
+    {
+      "epoch": 0.851297542221612,
+      "grad_norm": 0.6594721674919128,
+      "learning_rate": 0.00023870573870573867,
+      "loss": 0.6079,
+      "step": 775
+    },
+    {
+      "epoch": 0.8523959906631883,
+      "grad_norm": 12.29937744140625,
+      "learning_rate": 0.00023858363858363858,
+      "loss": 1.1068,
+      "step": 776
+    },
+    {
+      "epoch": 0.8534944391047645,
+      "grad_norm": 1.175355315208435,
+      "learning_rate": 0.00023846153846153844,
+      "loss": 0.734,
+      "step": 777
+    },
+    {
+      "epoch": 0.8545928875463408,
+      "grad_norm": 1.7128019332885742,
+      "learning_rate": 0.00023833943833943832,
+      "loss": 0.6395,
+      "step": 778
+    },
+    {
+      "epoch": 0.855691335987917,
+      "grad_norm": 0.6479717493057251,
+      "learning_rate": 0.0002382173382173382,
+      "loss": 0.8572,
+      "step": 779
+    },
+    {
+      "epoch": 0.8567897844294934,
+      "grad_norm": 0.9646544456481934,
+      "learning_rate": 0.00023809523809523807,
+      "loss": 1.1168,
+      "step": 780
+    },
+    {
+      "epoch": 0.8578882328710696,
+      "grad_norm": 0.8290930986404419,
+      "learning_rate": 0.00023797313797313795,
+      "loss": 0.4413,
+      "step": 781
+    },
+    {
+      "epoch": 0.8589866813126459,
+      "grad_norm": 0.6690389513969421,
+      "learning_rate": 0.00023785103785103784,
+      "loss": 1.1878,
+      "step": 782
+    },
+    {
+      "epoch": 0.8600851297542221,
+      "grad_norm": 0.6602356433868408,
+      "learning_rate": 0.0002377289377289377,
+      "loss": 0.5862,
+      "step": 783
+    },
+    {
+      "epoch": 0.8611835781957984,
+      "grad_norm": 0.612316370010376,
+      "learning_rate": 0.00023760683760683758,
+      "loss": 0.7971,
+      "step": 784
+    },
+    {
+      "epoch": 0.8622820266373747,
+      "grad_norm": 0.7429434657096863,
+      "learning_rate": 0.00023748473748473747,
+      "loss": 0.6265,
+      "step": 785
+    },
+    {
+      "epoch": 0.863380475078951,
+      "grad_norm": 0.40107640624046326,
+      "learning_rate": 0.00023736263736263733,
+      "loss": 0.6697,
+      "step": 786
+    },
+    {
+      "epoch": 0.8644789235205272,
+      "grad_norm": 0.45808035135269165,
+      "learning_rate": 0.0002372405372405372,
+      "loss": 0.7443,
+      "step": 787
+    },
+    {
+      "epoch": 0.8655773719621035,
+      "grad_norm": 0.36327049136161804,
+      "learning_rate": 0.0002371184371184371,
+      "loss": 0.6518,
+      "step": 788
+    },
+    {
+      "epoch": 0.8666758204036799,
+      "grad_norm": 0.45617833733558655,
+      "learning_rate": 0.00023699633699633696,
+      "loss": 0.792,
+      "step": 789
+    },
+    {
+      "epoch": 0.8677742688452561,
+      "grad_norm": 0.5354835391044617,
+      "learning_rate": 0.00023687423687423687,
+      "loss": 0.7788,
+      "step": 790
+    },
+    {
+      "epoch": 0.8688727172868324,
+      "grad_norm": 0.9770327210426331,
+      "learning_rate": 0.00023675213675213675,
+      "loss": 0.7267,
+      "step": 791
+    },
+    {
+      "epoch": 0.8699711657284086,
+      "grad_norm": 0.646757960319519,
+      "learning_rate": 0.0002366300366300366,
+      "loss": 0.7234,
+      "step": 792
+    },
+    {
+      "epoch": 0.8710696141699849,
+      "grad_norm": 0.4694693982601166,
+      "learning_rate": 0.0002365079365079365,
+      "loss": 0.8261,
+      "step": 793
+    },
+    {
+      "epoch": 0.8721680626115612,
+      "grad_norm": 0.9923954606056213,
+      "learning_rate": 0.00023638583638583638,
+      "loss": 0.703,
+      "step": 794
+    },
+    {
+      "epoch": 0.8732665110531375,
+      "grad_norm": 1.6440534591674805,
+      "learning_rate": 0.00023626373626373624,
+      "loss": 0.7654,
+      "step": 795
+    },
+    {
+      "epoch": 0.8743649594947137,
+      "grad_norm": 0.3947128653526306,
+      "learning_rate": 0.00023614163614163613,
+      "loss": 0.637,
+      "step": 796
+    },
+    {
+      "epoch": 0.87546340793629,
+      "grad_norm": 3.4264323711395264,
+      "learning_rate": 0.000236019536019536,
+      "loss": 0.7325,
+      "step": 797
+    },
+    {
+      "epoch": 0.8765618563778662,
+      "grad_norm": 0.5469256043434143,
+      "learning_rate": 0.00023589743589743587,
+      "loss": 0.8203,
+      "step": 798
+    },
+    {
+      "epoch": 0.8776603048194426,
+      "grad_norm": 0.5184471011161804,
+      "learning_rate": 0.00023577533577533576,
+      "loss": 0.7895,
+      "step": 799
+    },
+    {
+      "epoch": 0.8787587532610188,
+      "grad_norm": 0.8231347799301147,
+      "learning_rate": 0.00023565323565323562,
+      "loss": 0.7888,
+      "step": 800
+    },
+    {
+      "epoch": 0.8798572017025951,
+      "grad_norm": 14.826855659484863,
+      "learning_rate": 0.0002355311355311355,
+      "loss": 0.7564,
+      "step": 801
+    },
+    {
+      "epoch": 0.8809556501441713,
+      "grad_norm": 0.5809927582740784,
+      "learning_rate": 0.00023540903540903539,
+      "loss": 0.6702,
+      "step": 802
+    },
+    {
+      "epoch": 0.8820540985857477,
+      "grad_norm": 0.7244674563407898,
+      "learning_rate": 0.00023528693528693524,
+      "loss": 0.6475,
+      "step": 803
+    },
+    {
+      "epoch": 0.8831525470273239,
+      "grad_norm": 0.8071272373199463,
+      "learning_rate": 0.00023516483516483513,
+      "loss": 0.7434,
+      "step": 804
+    },
+    {
+      "epoch": 0.8842509954689002,
+      "grad_norm": 0.6872429847717285,
+      "learning_rate": 0.00023504273504273504,
+      "loss": 0.5968,
+      "step": 805
+    },
+    {
+      "epoch": 0.8853494439104764,
+      "grad_norm": 9.353965759277344,
+      "learning_rate": 0.00023492063492063487,
+      "loss": 0.4228,
+      "step": 806
+    },
+    {
+      "epoch": 0.8864478923520527,
+      "grad_norm": 0.47151222825050354,
+      "learning_rate": 0.00023479853479853479,
+      "loss": 0.6832,
+      "step": 807
+    },
+    {
+      "epoch": 0.887546340793629,
+      "grad_norm": 1.4599422216415405,
+      "learning_rate": 0.00023467643467643467,
+      "loss": 0.6692,
+      "step": 808
+    },
+    {
+      "epoch": 0.8886447892352053,
+      "grad_norm": 0.45811519026756287,
+      "learning_rate": 0.00023455433455433453,
+      "loss": 0.787,
+      "step": 809
+    },
+    {
+      "epoch": 0.8897432376767815,
+      "grad_norm": 1.077709674835205,
+      "learning_rate": 0.00023443223443223442,
+      "loss": 0.6695,
+      "step": 810
+    },
+    {
+      "epoch": 0.8908416861183578,
+      "grad_norm": 0.5702061057090759,
+      "learning_rate": 0.0002343101343101343,
+      "loss": 0.5858,
+      "step": 811
+    },
+    {
+      "epoch": 0.891940134559934,
+      "grad_norm": 2.2391059398651123,
+      "learning_rate": 0.00023418803418803416,
+      "loss": 0.6688,
+      "step": 812
+    },
+    {
+      "epoch": 0.8930385830015104,
+      "grad_norm": 1.6974279880523682,
+      "learning_rate": 0.00023406593406593405,
+      "loss": 0.8545,
+      "step": 813
+    },
+    {
+      "epoch": 0.8941370314430866,
+      "grad_norm": 0.983435869216919,
+      "learning_rate": 0.00023394383394383393,
+      "loss": 0.8128,
+      "step": 814
+    },
+    {
+      "epoch": 0.8952354798846629,
+      "grad_norm": 0.44103240966796875,
+      "learning_rate": 0.0002338217338217338,
+      "loss": 0.7968,
+      "step": 815
+    },
+    {
+      "epoch": 0.8963339283262391,
+      "grad_norm": 1.0707038640975952,
+      "learning_rate": 0.00023369963369963367,
+      "loss": 0.6996,
+      "step": 816
+    },
+    {
+      "epoch": 0.8974323767678155,
+      "grad_norm": 0.8029122352600098,
+      "learning_rate": 0.00023357753357753356,
+      "loss": 0.7911,
+      "step": 817
+    },
+    {
+      "epoch": 0.8985308252093918,
+      "grad_norm": 0.46339499950408936,
+      "learning_rate": 0.00023345543345543342,
+      "loss": 0.7712,
+      "step": 818
+    },
+    {
+      "epoch": 0.899629273650968,
+      "grad_norm": 1.020947813987732,
+      "learning_rate": 0.0002333333333333333,
+      "loss": 0.6865,
+      "step": 819
+    },
+    {
+      "epoch": 0.9007277220925443,
+      "grad_norm": 0.5332039594650269,
+      "learning_rate": 0.00023321123321123322,
+      "loss": 0.8352,
+      "step": 820
+    },
+    {
+      "epoch": 0.9018261705341205,
+      "grad_norm": 0.40052923560142517,
+      "learning_rate": 0.00023308913308913307,
+      "loss": 0.5435,
+      "step": 821
+    },
+    {
+      "epoch": 0.9029246189756969,
+      "grad_norm": 0.6643521189689636,
+      "learning_rate": 0.00023296703296703296,
+      "loss": 0.7406,
+      "step": 822
+    },
+    {
+      "epoch": 0.9040230674172731,
+      "grad_norm": 0.7514997720718384,
+      "learning_rate": 0.00023284493284493285,
+      "loss": 0.7595,
+      "step": 823
+    },
+    {
+      "epoch": 0.9051215158588494,
+      "grad_norm": 0.7124571204185486,
+      "learning_rate": 0.0002327228327228327,
+      "loss": 0.5736,
+      "step": 824
+    },
+    {
+      "epoch": 0.9062199643004256,
+      "grad_norm": 0.6757075786590576,
+      "learning_rate": 0.0002326007326007326,
+      "loss": 0.6275,
+      "step": 825
+    },
+    {
+      "epoch": 0.9073184127420019,
+      "grad_norm": 0.4200783669948578,
+      "learning_rate": 0.00023247863247863245,
+      "loss": 0.6267,
+      "step": 826
+    },
+    {
+      "epoch": 0.9084168611835782,
+      "grad_norm": 0.5442836284637451,
+      "learning_rate": 0.00023235653235653233,
+      "loss": 0.6814,
+      "step": 827
+    },
+    {
+      "epoch": 0.9095153096251545,
+      "grad_norm": 0.4859601557254791,
+      "learning_rate": 0.00023223443223443222,
+      "loss": 0.6451,
+      "step": 828
+    },
+    {
+      "epoch": 0.9106137580667307,
+      "grad_norm": 0.7353097200393677,
+      "learning_rate": 0.00023211233211233208,
+      "loss": 0.6723,
+      "step": 829
+    },
+    {
+      "epoch": 0.911712206508307,
+      "grad_norm": 0.6389304995536804,
+      "learning_rate": 0.00023199023199023196,
+      "loss": 0.9429,
+      "step": 830
+    },
+    {
+      "epoch": 0.9128106549498833,
+      "grad_norm": 0.6813933849334717,
+      "learning_rate": 0.00023186813186813185,
+      "loss": 0.5319,
+      "step": 831
+    },
+    {
+      "epoch": 0.9139091033914596,
+      "grad_norm": 0.40023690462112427,
+      "learning_rate": 0.0002317460317460317,
+      "loss": 0.5808,
+      "step": 832
+    },
+    {
+      "epoch": 0.9150075518330358,
+      "grad_norm": 0.5327205657958984,
+      "learning_rate": 0.0002316239316239316,
+      "loss": 0.6666,
+      "step": 833
+    },
+    {
+      "epoch": 0.9161060002746121,
+      "grad_norm": 1.672450065612793,
+      "learning_rate": 0.0002315018315018315,
+      "loss": 0.7758,
+      "step": 834
+    },
+    {
+      "epoch": 0.9172044487161883,
+      "grad_norm": 0.5022990703582764,
+      "learning_rate": 0.00023137973137973134,
+      "loss": 0.6309,
+      "step": 835
+    },
+    {
+      "epoch": 0.9183028971577647,
+      "grad_norm": 0.43023642897605896,
+      "learning_rate": 0.00023125763125763125,
+      "loss": 0.5343,
+      "step": 836
+    },
+    {
+      "epoch": 0.919401345599341,
+      "grad_norm": 0.6878641843795776,
+      "learning_rate": 0.00023113553113553113,
+      "loss": 0.7268,
+      "step": 837
+    },
+    {
+      "epoch": 0.9204997940409172,
+      "grad_norm": 0.40551453828811646,
+      "learning_rate": 0.000231013431013431,
+      "loss": 0.5784,
+      "step": 838
+    },
+    {
+      "epoch": 0.9215982424824934,
+      "grad_norm": 0.412356436252594,
+      "learning_rate": 0.00023089133089133088,
+      "loss": 0.7685,
+      "step": 839
+    },
+    {
+      "epoch": 0.9226966909240698,
+      "grad_norm": 1.1603305339813232,
+      "learning_rate": 0.00023076923076923076,
+      "loss": 0.518,
+      "step": 840
+    },
+    {
+      "epoch": 0.9237951393656461,
+      "grad_norm": 0.6733229756355286,
+      "learning_rate": 0.00023064713064713062,
+      "loss": 0.5883,
+      "step": 841
+    },
+    {
+      "epoch": 0.9248935878072223,
+      "grad_norm": 0.619434654712677,
+      "learning_rate": 0.0002305250305250305,
+      "loss": 0.6244,
+      "step": 842
+    },
+    {
+      "epoch": 0.9259920362487986,
+      "grad_norm": 0.6989772319793701,
+      "learning_rate": 0.0002304029304029304,
+      "loss": 0.5763,
+      "step": 843
+    },
+    {
+      "epoch": 0.9270904846903748,
+      "grad_norm": 0.6276418566703796,
+      "learning_rate": 0.00023028083028083025,
+      "loss": 0.4762,
+      "step": 844
+    },
+    {
+      "epoch": 0.9281889331319512,
+      "grad_norm": 0.5577360987663269,
+      "learning_rate": 0.00023015873015873014,
+      "loss": 0.6254,
+      "step": 845
+    },
+    {
+      "epoch": 0.9292873815735274,
+      "grad_norm": 0.6185848116874695,
+      "learning_rate": 0.00023003663003663002,
+      "loss": 1.0182,
+      "step": 846
+    },
+    {
+      "epoch": 0.9303858300151037,
+      "grad_norm": 1.2415262460708618,
+      "learning_rate": 0.00022991452991452988,
+      "loss": 0.4677,
+      "step": 847
+    },
+    {
+      "epoch": 0.9314842784566799,
+      "grad_norm": 0.4582594335079193,
+      "learning_rate": 0.00022979242979242977,
+      "loss": 0.6308,
+      "step": 848
+    },
+    {
+      "epoch": 0.9325827268982562,
+      "grad_norm": 0.4749620258808136,
+      "learning_rate": 0.00022967032967032962,
+      "loss": 0.6217,
+      "step": 849
+    },
+    {
+      "epoch": 0.9336811753398325,
+      "grad_norm": 0.48614588379859924,
+      "learning_rate": 0.0002295482295482295,
+      "loss": 0.7469,
+      "step": 850
+    },
+    {
+      "epoch": 0.9347796237814088,
+      "grad_norm": 0.7357453107833862,
+      "learning_rate": 0.00022942612942612942,
+      "loss": 0.5978,
+      "step": 851
+    },
+    {
+      "epoch": 0.935878072222985,
+      "grad_norm": 0.53326815366745,
+      "learning_rate": 0.00022930402930402928,
+      "loss": 0.7678,
+      "step": 852
+    },
+    {
+      "epoch": 0.9369765206645613,
+      "grad_norm": 0.4853271245956421,
+      "learning_rate": 0.00022918192918192917,
+      "loss": 0.4888,
+      "step": 853
+    },
+    {
+      "epoch": 0.9380749691061376,
+      "grad_norm": 1.6529743671417236,
+      "learning_rate": 0.00022905982905982905,
+      "loss": 0.6103,
+      "step": 854
+    },
+    {
+      "epoch": 0.9391734175477139,
+      "grad_norm": 0.8255143165588379,
+      "learning_rate": 0.0002289377289377289,
+      "loss": 0.6977,
+      "step": 855
+    },
+    {
+      "epoch": 0.9402718659892901,
+      "grad_norm": 0.3999016284942627,
+      "learning_rate": 0.0002288156288156288,
+      "loss": 0.5398,
+      "step": 856
+    },
+    {
+      "epoch": 0.9413703144308664,
+      "grad_norm": 1.933090329170227,
+      "learning_rate": 0.00022869352869352868,
+      "loss": 1.0827,
+      "step": 857
+    },
+    {
+      "epoch": 0.9424687628724426,
+      "grad_norm": 0.8884105682373047,
+      "learning_rate": 0.00022857142857142854,
+      "loss": 0.702,
+      "step": 858
+    },
+    {
+      "epoch": 0.943567211314019,
+      "grad_norm": 0.4555901885032654,
+      "learning_rate": 0.00022844932844932843,
+      "loss": 0.8737,
+      "step": 859
+    },
+    {
+      "epoch": 0.9446656597555952,
+      "grad_norm": 0.535915732383728,
+      "learning_rate": 0.0002283272283272283,
+      "loss": 0.7036,
+      "step": 860
+    },
+    {
+      "epoch": 0.9457641081971715,
+      "grad_norm": 0.7607597708702087,
+      "learning_rate": 0.00022820512820512817,
+      "loss": 0.8707,
+      "step": 861
+    },
+    {
+      "epoch": 0.9468625566387477,
+      "grad_norm": 0.4056457579135895,
+      "learning_rate": 0.00022808302808302805,
+      "loss": 0.6658,
+      "step": 862
+    },
+    {
+      "epoch": 0.947961005080324,
+      "grad_norm": 0.5472984313964844,
+      "learning_rate": 0.00022796092796092794,
+      "loss": 0.5429,
+      "step": 863
+    },
+    {
+      "epoch": 0.9490594535219004,
+      "grad_norm": 0.6866592764854431,
+      "learning_rate": 0.0002278388278388278,
+      "loss": 0.7343,
+      "step": 864
+    },
+    {
+      "epoch": 0.9501579019634766,
+      "grad_norm": 0.5244406461715698,
+      "learning_rate": 0.0002277167277167277,
+      "loss": 0.669,
+      "step": 865
+    },
+    {
+      "epoch": 0.9512563504050529,
+      "grad_norm": 0.45024383068084717,
+      "learning_rate": 0.0002275946275946276,
+      "loss": 0.9062,
+      "step": 866
+    },
+    {
+      "epoch": 0.9523547988466291,
+      "grad_norm": 0.4252873659133911,
+      "learning_rate": 0.00022747252747252745,
+      "loss": 0.6109,
+      "step": 867
+    },
+    {
+      "epoch": 0.9534532472882055,
+      "grad_norm": 0.50081467628479,
+      "learning_rate": 0.00022735042735042734,
+      "loss": 0.5266,
+      "step": 868
+    },
+    {
+      "epoch": 0.9545516957297817,
+      "grad_norm": 0.9674072861671448,
+      "learning_rate": 0.00022722832722832723,
+      "loss": 0.7197,
+      "step": 869
+    },
+    {
+      "epoch": 0.955650144171358,
+      "grad_norm": 1.572348952293396,
+      "learning_rate": 0.00022710622710622708,
+      "loss": 0.4728,
+      "step": 870
+    },
+    {
+      "epoch": 0.9567485926129342,
+      "grad_norm": 0.6033158898353577,
+      "learning_rate": 0.00022698412698412697,
+      "loss": 0.6394,
+      "step": 871
+    },
+    {
+      "epoch": 0.9578470410545105,
+      "grad_norm": 0.5810523629188538,
+      "learning_rate": 0.00022686202686202686,
+      "loss": 0.8813,
+      "step": 872
+    },
+    {
+      "epoch": 0.9589454894960868,
+      "grad_norm": 0.46345213055610657,
+      "learning_rate": 0.00022673992673992671,
+      "loss": 0.5828,
+      "step": 873
+    },
+    {
+      "epoch": 0.9600439379376631,
+      "grad_norm": 0.5414748191833496,
+      "learning_rate": 0.0002266178266178266,
+      "loss": 0.6311,
+      "step": 874
+    },
+    {
+      "epoch": 0.9611423863792393,
+      "grad_norm": 0.9083818197250366,
+      "learning_rate": 0.00022649572649572646,
+      "loss": 0.961,
+      "step": 875
+    },
+    {
+      "epoch": 0.9622408348208156,
+      "grad_norm": 0.786993145942688,
+      "learning_rate": 0.00022637362637362634,
+      "loss": 0.7825,
+      "step": 876
+    },
+    {
+      "epoch": 0.9633392832623918,
+      "grad_norm": 0.7639968991279602,
+      "learning_rate": 0.00022625152625152623,
+      "loss": 0.8989,
+      "step": 877
+    },
+    {
+      "epoch": 0.9644377317039682,
+      "grad_norm": 0.43360400199890137,
+      "learning_rate": 0.0002261294261294261,
+      "loss": 0.6747,
+      "step": 878
+    },
+    {
+      "epoch": 0.9655361801455444,
+      "grad_norm": 0.8512898683547974,
+      "learning_rate": 0.00022600732600732597,
+      "loss": 0.7152,
+      "step": 879
+    },
+    {
+      "epoch": 0.9666346285871207,
+      "grad_norm": 0.46903684735298157,
+      "learning_rate": 0.00022588522588522589,
+      "loss": 0.7594,
+      "step": 880
+    },
+    {
+      "epoch": 0.9677330770286969,
+      "grad_norm": 1.9560080766677856,
+      "learning_rate": 0.00022576312576312572,
+      "loss": 0.598,
+      "step": 881
+    },
+    {
+      "epoch": 0.9688315254702733,
+      "grad_norm": 1.1595470905303955,
+      "learning_rate": 0.00022564102564102563,
+      "loss": 0.6005,
+      "step": 882
+    },
+    {
+      "epoch": 0.9699299739118495,
+      "grad_norm": 0.7318668365478516,
+      "learning_rate": 0.00022551892551892551,
+      "loss": 0.7327,
+      "step": 883
+    },
+    {
+      "epoch": 0.9710284223534258,
+      "grad_norm": 0.6557647585868835,
+      "learning_rate": 0.00022539682539682537,
+      "loss": 0.5858,
+      "step": 884
+    },
+    {
+      "epoch": 0.972126870795002,
+      "grad_norm": 0.5645928382873535,
+      "learning_rate": 0.00022527472527472526,
+      "loss": 0.5818,
+      "step": 885
+    },
+    {
+      "epoch": 0.9732253192365783,
+      "grad_norm": 0.4630253314971924,
+      "learning_rate": 0.00022515262515262514,
+      "loss": 0.8363,
+      "step": 886
+    },
+    {
+      "epoch": 0.9743237676781547,
+      "grad_norm": 0.6750912666320801,
+      "learning_rate": 0.000225030525030525,
+      "loss": 0.8865,
+      "step": 887
+    },
+    {
+      "epoch": 0.9754222161197309,
+      "grad_norm": 0.6309487819671631,
+      "learning_rate": 0.0002249084249084249,
+      "loss": 0.5596,
+      "step": 888
+    },
+    {
+      "epoch": 0.9765206645613072,
+      "grad_norm": 0.9696050882339478,
+      "learning_rate": 0.00022478632478632477,
+      "loss": 0.7752,
+      "step": 889
+    },
+    {
+      "epoch": 0.9776191130028834,
+      "grad_norm": 0.7614735960960388,
+      "learning_rate": 0.00022466422466422463,
+      "loss": 0.7131,
+      "step": 890
+    },
+    {
+      "epoch": 0.9787175614444596,
+      "grad_norm": 0.4971006214618683,
+      "learning_rate": 0.00022454212454212452,
+      "loss": 0.6218,
+      "step": 891
+    },
+    {
+      "epoch": 0.979816009886036,
+      "grad_norm": 0.47809773683547974,
+      "learning_rate": 0.0002244200244200244,
+      "loss": 0.5678,
+      "step": 892
+    },
+    {
+      "epoch": 0.9809144583276123,
+      "grad_norm": 0.5959337949752808,
+      "learning_rate": 0.00022429792429792426,
+      "loss": 1.0002,
+      "step": 893
+    },
+    {
+      "epoch": 0.9820129067691885,
+      "grad_norm": 0.45277753472328186,
+      "learning_rate": 0.00022417582417582415,
+      "loss": 0.7321,
+      "step": 894
+    },
+    {
+      "epoch": 0.9831113552107648,
+      "grad_norm": 1.279405951499939,
+      "learning_rate": 0.00022405372405372406,
+      "loss": 0.7912,
+      "step": 895
+    },
+    {
+      "epoch": 0.9842098036523411,
+      "grad_norm": 0.49885687232017517,
+      "learning_rate": 0.00022393162393162392,
+      "loss": 0.5558,
+      "step": 896
+    },
+    {
+      "epoch": 0.9853082520939174,
+      "grad_norm": 0.474979430437088,
+      "learning_rate": 0.0002238095238095238,
+      "loss": 0.7095,
+      "step": 897
+    },
+    {
+      "epoch": 0.9864067005354936,
+      "grad_norm": 0.3826389014720917,
+      "learning_rate": 0.0002236874236874237,
+      "loss": 0.5695,
+      "step": 898
+    },
+    {
+      "epoch": 0.9875051489770699,
+      "grad_norm": 0.33514517545700073,
+      "learning_rate": 0.00022356532356532355,
+      "loss": 0.6341,
+      "step": 899
+    },
+    {
+      "epoch": 0.9886035974186461,
+      "grad_norm": 0.5049251914024353,
+      "learning_rate": 0.00022344322344322343,
+      "loss": 0.5577,
+      "step": 900
+    },
+    {
+      "epoch": 0.9897020458602225,
+      "grad_norm": 0.5179988145828247,
+      "learning_rate": 0.0002233211233211233,
+      "loss": 0.5769,
+      "step": 901
+    },
+    {
+      "epoch": 0.9908004943017987,
+      "grad_norm": 0.5194469094276428,
+      "learning_rate": 0.00022319902319902318,
+      "loss": 0.5466,
+      "step": 902
+    },
+    {
+      "epoch": 0.991898942743375,
+      "grad_norm": 0.46941491961479187,
+      "learning_rate": 0.00022307692307692306,
+      "loss": 0.642,
+      "step": 903
+    },
+    {
+      "epoch": 0.9929973911849512,
+      "grad_norm": 0.379682719707489,
+      "learning_rate": 0.00022295482295482292,
+      "loss": 0.5508,
+      "step": 904
+    },
+    {
+      "epoch": 0.9940958396265275,
+      "grad_norm": 1.3844119310379028,
+      "learning_rate": 0.0002228327228327228,
+      "loss": 0.8814,
+      "step": 905
+    },
+    {
+      "epoch": 0.9951942880681038,
+      "grad_norm": 2.497697114944458,
+      "learning_rate": 0.0002227106227106227,
+      "loss": 0.8116,
+      "step": 906
+    },
+    {
+      "epoch": 0.9962927365096801,
+      "grad_norm": 0.36689239740371704,
+      "learning_rate": 0.00022258852258852255,
+      "loss": 0.5001,
+      "step": 907
+    },
+    {
+      "epoch": 0.9973911849512563,
+      "grad_norm": 0.39868447184562683,
+      "learning_rate": 0.00022246642246642243,
+      "loss": 0.6913,
+      "step": 908
+    },
+    {
+      "epoch": 0.9984896333928326,
+      "grad_norm": 0.5270336270332336,
+      "learning_rate": 0.00022234432234432235,
+      "loss": 0.5401,
+      "step": 909
+    },
+    {
+      "epoch": 0.999588081834409,
+      "grad_norm": 0.4079851508140564,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 0.471,
+      "step": 910
+    },
+    {
+      "epoch": 1.000686530275985,
+      "grad_norm": 0.43189048767089844,
+      "learning_rate": 0.0002221001221001221,
+      "loss": 0.8237,
+      "step": 911
+    },
+    {
+      "epoch": 1.0017849787175614,
+      "grad_norm": 0.52342289686203,
+      "learning_rate": 0.00022197802197802198,
+      "loss": 0.6363,
+      "step": 912
+    },
+    {
+      "epoch": 1.0028834271591378,
+      "grad_norm": 0.38078904151916504,
+      "learning_rate": 0.00022185592185592184,
+      "loss": 0.4411,
+      "step": 913
+    },
+    {
+      "epoch": 1.003981875600714,
+      "grad_norm": 0.5302817821502686,
+      "learning_rate": 0.00022173382173382172,
+      "loss": 0.858,
+      "step": 914
+    },
+    {
+      "epoch": 1.0050803240422903,
+      "grad_norm": 0.3696751892566681,
+      "learning_rate": 0.0002216117216117216,
+      "loss": 0.8766,
+      "step": 915
+    },
+    {
+      "epoch": 1.0061787724838664,
+      "grad_norm": 0.7566766738891602,
+      "learning_rate": 0.00022148962148962146,
+      "loss": 1.067,
+      "step": 916
+    },
+    {
+      "epoch": 1.0072772209254428,
+      "grad_norm": 0.7399318218231201,
+      "learning_rate": 0.00022136752136752135,
+      "loss": 0.6683,
+      "step": 917
+    },
+    {
+      "epoch": 1.0083756693670192,
+      "grad_norm": 0.5435899496078491,
+      "learning_rate": 0.00022124542124542124,
+      "loss": 0.6045,
+      "step": 918
+    },
+    {
+      "epoch": 1.0094741178085953,
+      "grad_norm": 0.9680571556091309,
+      "learning_rate": 0.0002211233211233211,
+      "loss": 0.7546,
+      "step": 919
+    },
+    {
+      "epoch": 1.0105725662501717,
+      "grad_norm": 0.6131067872047424,
+      "learning_rate": 0.00022100122100122098,
+      "loss": 0.6655,
+      "step": 920
+    },
+    {
+      "epoch": 1.0116710146917478,
+      "grad_norm": 0.8093316555023193,
+      "learning_rate": 0.00022087912087912086,
+      "loss": 0.4812,
+      "step": 921
+    },
+    {
+      "epoch": 1.0127694631333242,
+      "grad_norm": 0.5077763199806213,
+      "learning_rate": 0.00022075702075702072,
+      "loss": 0.5357,
+      "step": 922
+    },
+    {
+      "epoch": 1.0138679115749005,
+      "grad_norm": 0.4767695963382721,
+      "learning_rate": 0.0002206349206349206,
+      "loss": 0.5807,
+      "step": 923
+    },
+    {
+      "epoch": 1.0149663600164767,
+      "grad_norm": 0.3215581178665161,
+      "learning_rate": 0.00022051282051282052,
+      "loss": 0.5773,
+      "step": 924
+    },
+    {
+      "epoch": 1.016064808458053,
+      "grad_norm": 0.425603985786438,
+      "learning_rate": 0.00022039072039072035,
+      "loss": 0.5441,
+      "step": 925
+    },
+    {
+      "epoch": 1.0171632568996292,
+      "grad_norm": 0.6131730079650879,
+      "learning_rate": 0.00022026862026862027,
+      "loss": 0.856,
+      "step": 926
+    },
+    {
+      "epoch": 1.0182617053412055,
+      "grad_norm": 0.5472941398620605,
+      "learning_rate": 0.00022014652014652012,
+      "loss": 0.8228,
+      "step": 927
+    },
+    {
+      "epoch": 1.0193601537827819,
+      "grad_norm": 0.46728211641311646,
+      "learning_rate": 0.00022002442002442,
+      "loss": 0.7615,
+      "step": 928
+    },
+    {
+      "epoch": 1.020458602224358,
+      "grad_norm": 0.39919501543045044,
+      "learning_rate": 0.0002199023199023199,
+      "loss": 0.709,
+      "step": 929
+    },
+    {
+      "epoch": 1.0215570506659344,
+      "grad_norm": 0.564400315284729,
+      "learning_rate": 0.00021978021978021975,
+      "loss": 0.5941,
+      "step": 930
+    },
+    {
+      "epoch": 1.0226554991075107,
+      "grad_norm": 0.39073804020881653,
+      "learning_rate": 0.00021965811965811964,
+      "loss": 0.6386,
+      "step": 931
+    },
+    {
+      "epoch": 1.0237539475490869,
+      "grad_norm": 0.3725563585758209,
+      "learning_rate": 0.00021953601953601952,
+      "loss": 0.4766,
+      "step": 932
+    },
+    {
+      "epoch": 1.0248523959906632,
+      "grad_norm": 1.319197654724121,
+      "learning_rate": 0.00021941391941391938,
+      "loss": 0.8465,
+      "step": 933
+    },
+    {
+      "epoch": 1.0259508444322394,
+      "grad_norm": 0.5126785635948181,
+      "learning_rate": 0.00021929181929181927,
+      "loss": 0.5103,
+      "step": 934
+    },
+    {
+      "epoch": 1.0270492928738157,
+      "grad_norm": 0.5401897430419922,
+      "learning_rate": 0.00021916971916971915,
+      "loss": 0.5879,
+      "step": 935
+    },
+    {
+      "epoch": 1.028147741315392,
+      "grad_norm": 0.47014057636260986,
+      "learning_rate": 0.000219047619047619,
+      "loss": 0.658,
+      "step": 936
+    },
+    {
+      "epoch": 1.0292461897569682,
+      "grad_norm": 0.49227291345596313,
+      "learning_rate": 0.0002189255189255189,
+      "loss": 0.5271,
+      "step": 937
+    },
+    {
+      "epoch": 1.0303446381985446,
+      "grad_norm": 0.8186778426170349,
+      "learning_rate": 0.00021880341880341878,
+      "loss": 0.6491,
+      "step": 938
+    },
+    {
+      "epoch": 1.0314430866401207,
+      "grad_norm": 0.46345674991607666,
+      "learning_rate": 0.00021868131868131864,
+      "loss": 0.7935,
+      "step": 939
+    },
+    {
+      "epoch": 1.032541535081697,
+      "grad_norm": 1.7300915718078613,
+      "learning_rate": 0.00021855921855921855,
+      "loss": 0.516,
+      "step": 940
+    },
+    {
+      "epoch": 1.0336399835232735,
+      "grad_norm": 0.5100822448730469,
+      "learning_rate": 0.00021843711843711844,
+      "loss": 0.8286,
+      "step": 941
+    },
+    {
+      "epoch": 1.0347384319648496,
+      "grad_norm": 0.42278483510017395,
+      "learning_rate": 0.0002183150183150183,
+      "loss": 0.7312,
+      "step": 942
+    },
+    {
+      "epoch": 1.035836880406426,
+      "grad_norm": 0.42105185985565186,
+      "learning_rate": 0.00021819291819291818,
+      "loss": 0.5729,
+      "step": 943
+    },
+    {
+      "epoch": 1.036935328848002,
+      "grad_norm": 0.5117312669754028,
+      "learning_rate": 0.00021807081807081807,
+      "loss": 0.7688,
+      "step": 944
+    },
+    {
+      "epoch": 1.0380337772895785,
+      "grad_norm": 0.4982740879058838,
+      "learning_rate": 0.00021794871794871793,
+      "loss": 0.5746,
+      "step": 945
+    },
+    {
+      "epoch": 1.0391322257311548,
+      "grad_norm": 0.5181052684783936,
+      "learning_rate": 0.0002178266178266178,
+      "loss": 0.8446,
+      "step": 946
+    },
+    {
+      "epoch": 1.040230674172731,
+      "grad_norm": 5.104315757751465,
+      "learning_rate": 0.0002177045177045177,
+      "loss": 0.9641,
+      "step": 947
+    },
+    {
+      "epoch": 1.0413291226143073,
+      "grad_norm": 0.7384645938873291,
+      "learning_rate": 0.00021758241758241756,
+      "loss": 0.7168,
+      "step": 948
+    },
+    {
+      "epoch": 1.0424275710558835,
+      "grad_norm": 0.4367550313472748,
+      "learning_rate": 0.00021746031746031744,
+      "loss": 0.7139,
+      "step": 949
+    },
+    {
+      "epoch": 1.0435260194974598,
+      "grad_norm": 0.7332566380500793,
+      "learning_rate": 0.00021733821733821733,
+      "loss": 0.7082,
+      "step": 950
+    },
+    {
+      "epoch": 1.0446244679390362,
+      "grad_norm": 0.4191775918006897,
+      "learning_rate": 0.00021721611721611719,
+      "loss": 0.7986,
+      "step": 951
+    },
+    {
+      "epoch": 1.0457229163806123,
+      "grad_norm": 0.33929941058158875,
+      "learning_rate": 0.00021709401709401707,
+      "loss": 0.3784,
+      "step": 952
+    },
+    {
+      "epoch": 1.0468213648221887,
+      "grad_norm": 0.5255181789398193,
+      "learning_rate": 0.00021697191697191693,
+      "loss": 0.5842,
+      "step": 953
+    },
+    {
+      "epoch": 1.047919813263765,
+      "grad_norm": 0.5401780605316162,
+      "learning_rate": 0.00021684981684981681,
+      "loss": 0.7939,
+      "step": 954
+    },
+    {
+      "epoch": 1.0490182617053412,
+      "grad_norm": 0.34873855113983154,
+      "learning_rate": 0.00021672771672771673,
+      "loss": 0.7957,
+      "step": 955
+    },
+    {
+      "epoch": 1.0501167101469175,
+      "grad_norm": 0.33418160676956177,
+      "learning_rate": 0.00021660561660561656,
+      "loss": 0.6037,
+      "step": 956
+    },
+    {
+      "epoch": 1.0512151585884937,
+      "grad_norm": 0.3197249174118042,
+      "learning_rate": 0.00021648351648351647,
+      "loss": 0.5223,
+      "step": 957
+    },
+    {
+      "epoch": 1.05231360703007,
+      "grad_norm": 0.5962835550308228,
+      "learning_rate": 0.00021636141636141636,
+      "loss": 0.5213,
+      "step": 958
+    },
+    {
+      "epoch": 1.0534120554716464,
+      "grad_norm": 1.3891643285751343,
+      "learning_rate": 0.00021623931623931622,
+      "loss": 0.6781,
+      "step": 959
+    },
+    {
+      "epoch": 1.0545105039132225,
+      "grad_norm": 0.42117932438850403,
+      "learning_rate": 0.0002161172161172161,
+      "loss": 0.6363,
+      "step": 960
+    },
+    {
+      "epoch": 1.055608952354799,
+      "grad_norm": 0.4514491558074951,
+      "learning_rate": 0.00021599511599511599,
+      "loss": 0.6904,
+      "step": 961
+    },
+    {
+      "epoch": 1.056707400796375,
+      "grad_norm": 0.4863387644290924,
+      "learning_rate": 0.00021587301587301584,
+      "loss": 0.6595,
+      "step": 962
+    },
+    {
+      "epoch": 1.0578058492379514,
+      "grad_norm": 0.6178450584411621,
+      "learning_rate": 0.00021575091575091573,
+      "loss": 0.8412,
+      "step": 963
+    },
+    {
+      "epoch": 1.0589042976795278,
+      "grad_norm": 0.3728642761707306,
+      "learning_rate": 0.00021562881562881562,
+      "loss": 0.629,
+      "step": 964
+    },
+    {
+      "epoch": 1.060002746121104,
+      "grad_norm": 0.7554892301559448,
+      "learning_rate": 0.00021550671550671547,
+      "loss": 0.5804,
+      "step": 965
+    },
+    {
+      "epoch": 1.0611011945626803,
+      "grad_norm": 0.550298273563385,
+      "learning_rate": 0.00021538461538461536,
+      "loss": 0.476,
+      "step": 966
+    },
+    {
+      "epoch": 1.0621996430042564,
+      "grad_norm": 0.4082244336605072,
+      "learning_rate": 0.00021526251526251524,
+      "loss": 0.4001,
+      "step": 967
+    },
+    {
+      "epoch": 1.0632980914458328,
+      "grad_norm": 1.2327499389648438,
+      "learning_rate": 0.0002151404151404151,
+      "loss": 0.4583,
+      "step": 968
+    },
+    {
+      "epoch": 1.0643965398874091,
+      "grad_norm": 0.860550045967102,
+      "learning_rate": 0.000215018315018315,
+      "loss": 0.6415,
+      "step": 969
+    },
+    {
+      "epoch": 1.0654949883289853,
+      "grad_norm": 0.558860182762146,
+      "learning_rate": 0.0002148962148962149,
+      "loss": 0.6215,
+      "step": 970
+    },
+    {
+      "epoch": 1.0665934367705616,
+      "grad_norm": 0.7794890403747559,
+      "learning_rate": 0.00021477411477411476,
+      "loss": 0.5094,
+      "step": 971
+    },
+    {
+      "epoch": 1.0676918852121378,
+      "grad_norm": 0.48574942350387573,
+      "learning_rate": 0.00021465201465201465,
+      "loss": 0.7385,
+      "step": 972
+    },
+    {
+      "epoch": 1.0687903336537141,
+      "grad_norm": 0.4496791660785675,
+      "learning_rate": 0.00021452991452991453,
+      "loss": 0.5036,
+      "step": 973
+    },
+    {
+      "epoch": 1.0698887820952905,
+      "grad_norm": 0.5360952615737915,
+      "learning_rate": 0.0002144078144078144,
+      "loss": 0.6825,
+      "step": 974
+    },
+    {
+      "epoch": 1.0709872305368666,
+      "grad_norm": 0.5783904194831848,
+      "learning_rate": 0.00021428571428571427,
+      "loss": 0.6736,
+      "step": 975
+    },
+    {
+      "epoch": 1.072085678978443,
+      "grad_norm": 2.290815830230713,
+      "learning_rate": 0.00021416361416361416,
+      "loss": 0.696,
+      "step": 976
+    },
+    {
+      "epoch": 1.0731841274200193,
+      "grad_norm": 1.3432899713516235,
+      "learning_rate": 0.00021404151404151402,
+      "loss": 0.5296,
+      "step": 977
+    },
+    {
+      "epoch": 1.0742825758615955,
+      "grad_norm": 0.5308722257614136,
+      "learning_rate": 0.0002139194139194139,
+      "loss": 0.6642,
+      "step": 978
+    },
+    {
+      "epoch": 1.0753810243031718,
+      "grad_norm": 0.7245768904685974,
+      "learning_rate": 0.00021379731379731376,
+      "loss": 0.6811,
+      "step": 979
+    },
+    {
+      "epoch": 1.076479472744748,
+      "grad_norm": 0.3873349726200104,
+      "learning_rate": 0.00021367521367521365,
+      "loss": 0.8503,
+      "step": 980
+    },
+    {
+      "epoch": 1.0775779211863243,
+      "grad_norm": 0.5792405605316162,
+      "learning_rate": 0.00021355311355311353,
+      "loss": 0.4543,
+      "step": 981
+    },
+    {
+      "epoch": 1.0786763696279005,
+      "grad_norm": 0.6543241143226624,
+      "learning_rate": 0.0002134310134310134,
+      "loss": 0.7778,
+      "step": 982
+    },
+    {
+      "epoch": 1.0797748180694768,
+      "grad_norm": 0.5572071075439453,
+      "learning_rate": 0.00021330891330891328,
+      "loss": 0.8446,
+      "step": 983
+    },
+    {
+      "epoch": 1.0808732665110532,
+      "grad_norm": 0.5798014402389526,
+      "learning_rate": 0.0002131868131868132,
+      "loss": 0.7461,
+      "step": 984
+    },
+    {
+      "epoch": 1.0819717149526293,
+      "grad_norm": 0.8282085657119751,
+      "learning_rate": 0.00021306471306471302,
+      "loss": 0.612,
+      "step": 985
+    },
+    {
+      "epoch": 1.0830701633942057,
+      "grad_norm": 0.5782580971717834,
+      "learning_rate": 0.00021294261294261293,
+      "loss": 0.5506,
+      "step": 986
+    },
+    {
+      "epoch": 1.084168611835782,
+      "grad_norm": 0.3826775848865509,
+      "learning_rate": 0.00021282051282051282,
+      "loss": 0.7859,
+      "step": 987
+    },
+    {
+      "epoch": 1.0852670602773582,
+      "grad_norm": 0.534752368927002,
+      "learning_rate": 0.00021269841269841268,
+      "loss": 0.8835,
+      "step": 988
+    },
+    {
+      "epoch": 1.0863655087189346,
+      "grad_norm": 0.45931264758110046,
+      "learning_rate": 0.00021257631257631256,
+      "loss": 0.6694,
+      "step": 989
+    },
+    {
+      "epoch": 1.0874639571605107,
+      "grad_norm": 0.6106250286102295,
+      "learning_rate": 0.00021245421245421245,
+      "loss": 0.8274,
+      "step": 990
+    },
+    {
+      "epoch": 1.088562405602087,
+      "grad_norm": 0.3704061806201935,
+      "learning_rate": 0.0002123321123321123,
+      "loss": 0.7449,
+      "step": 991
+    },
+    {
+      "epoch": 1.0896608540436634,
+      "grad_norm": 0.3922840356826782,
+      "learning_rate": 0.0002122100122100122,
+      "loss": 0.5845,
+      "step": 992
+    },
+    {
+      "epoch": 1.0907593024852396,
+      "grad_norm": 0.48152726888656616,
+      "learning_rate": 0.00021208791208791208,
+      "loss": 0.6608,
+      "step": 993
+    },
+    {
+      "epoch": 1.091857750926816,
+      "grad_norm": 0.42257216572761536,
+      "learning_rate": 0.00021196581196581194,
+      "loss": 0.6379,
+      "step": 994
+    },
+    {
+      "epoch": 1.092956199368392,
+      "grad_norm": 0.4746345579624176,
+      "learning_rate": 0.00021184371184371182,
+      "loss": 0.6467,
+      "step": 995
+    },
+    {
+      "epoch": 1.0940546478099684,
+      "grad_norm": 0.3915644884109497,
+      "learning_rate": 0.0002117216117216117,
+      "loss": 0.9699,
+      "step": 996
+    },
+    {
+      "epoch": 1.0951530962515448,
+      "grad_norm": 0.5957880020141602,
+      "learning_rate": 0.00021159951159951157,
+      "loss": 0.6917,
+      "step": 997
+    },
+    {
+      "epoch": 1.096251544693121,
+      "grad_norm": 0.4327985942363739,
+      "learning_rate": 0.00021147741147741145,
+      "loss": 0.8091,
+      "step": 998
+    },
+    {
+      "epoch": 1.0973499931346973,
+      "grad_norm": 0.42600274085998535,
+      "learning_rate": 0.00021135531135531136,
+      "loss": 0.7685,
+      "step": 999
+    },
+    {
+      "epoch": 1.0984484415762734,
+      "grad_norm": 0.7165039777755737,
+      "learning_rate": 0.0002112332112332112,
+      "loss": 0.8646,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0995468900178498,
+      "grad_norm": 0.447652131319046,
+      "learning_rate": 0.0002111111111111111,
+      "loss": 0.521,
+      "step": 1001
+    },
+    {
+      "epoch": 1.1006453384594261,
+      "grad_norm": 0.3022591769695282,
+      "learning_rate": 0.000210989010989011,
+      "loss": 0.6099,
+      "step": 1002
+    },
+    {
+      "epoch": 1.1017437869010023,
+      "grad_norm": 0.32764387130737305,
+      "learning_rate": 0.00021086691086691085,
+      "loss": 0.5624,
+      "step": 1003
+    },
+    {
+      "epoch": 1.1028422353425786,
+      "grad_norm": 0.7301959991455078,
+      "learning_rate": 0.00021074481074481074,
+      "loss": 0.6091,
+      "step": 1004
+    },
+    {
+      "epoch": 1.1039406837841548,
+      "grad_norm": 0.4734131097793579,
+      "learning_rate": 0.0002106227106227106,
+      "loss": 0.6849,
+      "step": 1005
+    },
+    {
+      "epoch": 1.1050391322257311,
+      "grad_norm": 0.7214820384979248,
+      "learning_rate": 0.00021050061050061048,
+      "loss": 0.789,
+      "step": 1006
+    },
+    {
+      "epoch": 1.1061375806673075,
+      "grad_norm": 0.31265702843666077,
+      "learning_rate": 0.00021037851037851037,
+      "loss": 0.5176,
+      "step": 1007
+    },
+    {
+      "epoch": 1.1072360291088836,
+      "grad_norm": 0.5804157257080078,
+      "learning_rate": 0.00021025641025641022,
+      "loss": 1.0152,
+      "step": 1008
+    },
+    {
+      "epoch": 1.10833447755046,
+      "grad_norm": 0.3624595105648041,
+      "learning_rate": 0.0002101343101343101,
+      "loss": 0.6843,
+      "step": 1009
+    },
+    {
+      "epoch": 1.1094329259920364,
+      "grad_norm": 0.5099515318870544,
+      "learning_rate": 0.00021001221001221,
+      "loss": 0.5568,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1105313744336125,
+      "grad_norm": 0.46201249957084656,
+      "learning_rate": 0.00020989010989010985,
+      "loss": 0.5883,
+      "step": 1011
+    },
+    {
+      "epoch": 1.1116298228751889,
+      "grad_norm": 0.4493483603000641,
+      "learning_rate": 0.00020976800976800974,
+      "loss": 0.8338,
+      "step": 1012
+    },
+    {
+      "epoch": 1.112728271316765,
+      "grad_norm": 0.4771614968776703,
+      "learning_rate": 0.00020964590964590963,
+      "loss": 0.7251,
+      "step": 1013
+    },
+    {
+      "epoch": 1.1138267197583414,
+      "grad_norm": 2.073347806930542,
+      "learning_rate": 0.00020952380952380948,
+      "loss": 0.8921,
+      "step": 1014
+    },
+    {
+      "epoch": 1.1149251681999177,
+      "grad_norm": 0.435680091381073,
+      "learning_rate": 0.0002094017094017094,
+      "loss": 0.5444,
+      "step": 1015
+    },
+    {
+      "epoch": 1.1160236166414939,
+      "grad_norm": 0.46824783086776733,
+      "learning_rate": 0.00020927960927960928,
+      "loss": 0.5591,
+      "step": 1016
+    },
+    {
+      "epoch": 1.1171220650830702,
+      "grad_norm": 0.43938374519348145,
+      "learning_rate": 0.00020915750915750914,
+      "loss": 0.7476,
+      "step": 1017
+    },
+    {
+      "epoch": 1.1182205135246464,
+      "grad_norm": 0.3620377779006958,
+      "learning_rate": 0.00020903540903540903,
+      "loss": 0.5763,
+      "step": 1018
+    },
+    {
+      "epoch": 1.1193189619662227,
+      "grad_norm": 0.612406313419342,
+      "learning_rate": 0.0002089133089133089,
+      "loss": 0.706,
+      "step": 1019
+    },
+    {
+      "epoch": 1.120417410407799,
+      "grad_norm": 0.5045173168182373,
+      "learning_rate": 0.00020879120879120877,
+      "loss": 0.6799,
+      "step": 1020
+    },
+    {
+      "epoch": 1.1215158588493752,
+      "grad_norm": 0.4815331995487213,
+      "learning_rate": 0.00020866910866910865,
+      "loss": 0.8845,
+      "step": 1021
+    },
+    {
+      "epoch": 1.1226143072909516,
+      "grad_norm": 0.3756159245967865,
+      "learning_rate": 0.00020854700854700854,
+      "loss": 0.5545,
+      "step": 1022
+    },
+    {
+      "epoch": 1.1237127557325277,
+      "grad_norm": 0.3184347152709961,
+      "learning_rate": 0.0002084249084249084,
+      "loss": 0.5109,
+      "step": 1023
+    },
+    {
+      "epoch": 1.124811204174104,
+      "grad_norm": 0.4000808298587799,
+      "learning_rate": 0.00020830280830280828,
+      "loss": 0.8363,
+      "step": 1024
+    },
+    {
+      "epoch": 1.1259096526156804,
+      "grad_norm": 0.3930743336677551,
+      "learning_rate": 0.00020818070818070817,
+      "loss": 0.6183,
+      "step": 1025
+    },
+    {
+      "epoch": 1.1270081010572566,
+      "grad_norm": 0.7536817789077759,
+      "learning_rate": 0.00020805860805860803,
+      "loss": 0.7511,
+      "step": 1026
+    },
+    {
+      "epoch": 1.128106549498833,
+      "grad_norm": 0.5012079477310181,
+      "learning_rate": 0.00020793650793650791,
+      "loss": 0.6346,
+      "step": 1027
+    },
+    {
+      "epoch": 1.129204997940409,
+      "grad_norm": 0.9914690852165222,
+      "learning_rate": 0.00020781440781440783,
+      "loss": 0.5827,
+      "step": 1028
+    },
+    {
+      "epoch": 1.1303034463819854,
+      "grad_norm": 0.9096476435661316,
+      "learning_rate": 0.00020769230769230766,
+      "loss": 1.0235,
+      "step": 1029
+    },
+    {
+      "epoch": 1.1314018948235618,
+      "grad_norm": 0.6668229699134827,
+      "learning_rate": 0.00020757020757020757,
+      "loss": 0.741,
+      "step": 1030
+    },
+    {
+      "epoch": 1.132500343265138,
+      "grad_norm": 0.3232771158218384,
+      "learning_rate": 0.0002074481074481074,
+      "loss": 0.6206,
+      "step": 1031
+    },
+    {
+      "epoch": 1.1335987917067143,
+      "grad_norm": 0.278003990650177,
+      "learning_rate": 0.00020732600732600731,
+      "loss": 0.5661,
+      "step": 1032
+    },
+    {
+      "epoch": 1.1346972401482907,
+      "grad_norm": 1.481213927268982,
+      "learning_rate": 0.0002072039072039072,
+      "loss": 0.6422,
+      "step": 1033
+    },
+    {
+      "epoch": 1.1357956885898668,
+      "grad_norm": 0.4688512682914734,
+      "learning_rate": 0.00020708180708180706,
+      "loss": 0.4163,
+      "step": 1034
+    },
+    {
+      "epoch": 1.1368941370314432,
+      "grad_norm": 0.6438425779342651,
+      "learning_rate": 0.00020695970695970694,
+      "loss": 0.6241,
+      "step": 1035
+    },
+    {
+      "epoch": 1.1379925854730193,
+      "grad_norm": 0.5013176798820496,
+      "learning_rate": 0.00020683760683760683,
+      "loss": 0.6273,
+      "step": 1036
+    },
+    {
+      "epoch": 1.1390910339145957,
+      "grad_norm": 0.5178597569465637,
+      "learning_rate": 0.0002067155067155067,
+      "loss": 0.7489,
+      "step": 1037
+    },
+    {
+      "epoch": 1.1401894823561718,
+      "grad_norm": 0.5804840922355652,
+      "learning_rate": 0.00020659340659340657,
+      "loss": 0.9142,
+      "step": 1038
+    },
+    {
+      "epoch": 1.1412879307977482,
+      "grad_norm": 0.47613444924354553,
+      "learning_rate": 0.00020647130647130646,
+      "loss": 0.9531,
+      "step": 1039
+    },
+    {
+      "epoch": 1.1423863792393245,
+      "grad_norm": 0.4835624694824219,
+      "learning_rate": 0.00020634920634920632,
+      "loss": 0.6349,
+      "step": 1040
+    },
+    {
+      "epoch": 1.1434848276809007,
+      "grad_norm": 0.38351112604141235,
+      "learning_rate": 0.0002062271062271062,
+      "loss": 0.4726,
+      "step": 1041
+    },
+    {
+      "epoch": 1.144583276122477,
+      "grad_norm": 0.5533854365348816,
+      "learning_rate": 0.0002061050061050061,
+      "loss": 0.5108,
+      "step": 1042
+    },
+    {
+      "epoch": 1.1456817245640534,
+      "grad_norm": 0.4842824637889862,
+      "learning_rate": 0.00020598290598290595,
+      "loss": 0.6038,
+      "step": 1043
+    },
+    {
+      "epoch": 1.1467801730056295,
+      "grad_norm": 0.552798330783844,
+      "learning_rate": 0.00020586080586080583,
+      "loss": 0.8056,
+      "step": 1044
+    },
+    {
+      "epoch": 1.1478786214472059,
+      "grad_norm": 0.40466025471687317,
+      "learning_rate": 0.00020573870573870574,
+      "loss": 0.6234,
+      "step": 1045
+    },
+    {
+      "epoch": 1.148977069888782,
+      "grad_norm": 0.6988784074783325,
+      "learning_rate": 0.0002056166056166056,
+      "loss": 0.7721,
+      "step": 1046
+    },
+    {
+      "epoch": 1.1500755183303584,
+      "grad_norm": 0.4852863550186157,
+      "learning_rate": 0.0002054945054945055,
+      "loss": 0.6074,
+      "step": 1047
+    },
+    {
+      "epoch": 1.1511739667719347,
+      "grad_norm": 0.4548696279525757,
+      "learning_rate": 0.00020537240537240537,
+      "loss": 0.5592,
+      "step": 1048
+    },
+    {
+      "epoch": 1.1522724152135109,
+      "grad_norm": 0.9355410933494568,
+      "learning_rate": 0.00020525030525030523,
+      "loss": 0.8618,
+      "step": 1049
+    },
+    {
+      "epoch": 1.1533708636550872,
+      "grad_norm": 0.5641398429870605,
+      "learning_rate": 0.00020512820512820512,
+      "loss": 0.704,
+      "step": 1050
+    },
+    {
+      "epoch": 1.1544693120966634,
+      "grad_norm": 0.48187771439552307,
+      "learning_rate": 0.000205006105006105,
+      "loss": 0.6008,
+      "step": 1051
+    },
+    {
+      "epoch": 1.1555677605382397,
+      "grad_norm": 0.41609904170036316,
+      "learning_rate": 0.00020488400488400486,
+      "loss": 0.8812,
+      "step": 1052
+    },
+    {
+      "epoch": 1.156666208979816,
+      "grad_norm": 0.919477105140686,
+      "learning_rate": 0.00020476190476190475,
+      "loss": 0.6597,
+      "step": 1053
+    },
+    {
+      "epoch": 1.1577646574213922,
+      "grad_norm": 0.5008611083030701,
+      "learning_rate": 0.0002046398046398046,
+      "loss": 0.6501,
+      "step": 1054
+    },
+    {
+      "epoch": 1.1588631058629686,
+      "grad_norm": 0.39832696318626404,
+      "learning_rate": 0.0002045177045177045,
+      "loss": 0.6232,
+      "step": 1055
+    },
+    {
+      "epoch": 1.159961554304545,
+      "grad_norm": 0.5290446281433105,
+      "learning_rate": 0.00020439560439560438,
+      "loss": 0.6123,
+      "step": 1056
+    },
+    {
+      "epoch": 1.161060002746121,
+      "grad_norm": 0.40837669372558594,
+      "learning_rate": 0.00020427350427350423,
+      "loss": 0.4989,
+      "step": 1057
+    },
+    {
+      "epoch": 1.1621584511876974,
+      "grad_norm": 0.43407055735588074,
+      "learning_rate": 0.00020415140415140412,
+      "loss": 0.6961,
+      "step": 1058
+    },
+    {
+      "epoch": 1.1632568996292736,
+      "grad_norm": 0.7601787447929382,
+      "learning_rate": 0.00020402930402930403,
+      "loss": 0.9308,
+      "step": 1059
+    },
+    {
+      "epoch": 1.16435534807085,
+      "grad_norm": 0.452628493309021,
+      "learning_rate": 0.00020390720390720386,
+      "loss": 0.6478,
+      "step": 1060
+    },
+    {
+      "epoch": 1.165453796512426,
+      "grad_norm": 0.4524000287055969,
+      "learning_rate": 0.00020378510378510378,
+      "loss": 0.4499,
+      "step": 1061
+    },
+    {
+      "epoch": 1.1665522449540024,
+      "grad_norm": 0.5971822142601013,
+      "learning_rate": 0.00020366300366300366,
+      "loss": 0.6402,
+      "step": 1062
+    },
+    {
+      "epoch": 1.1676506933955788,
+      "grad_norm": 0.36858659982681274,
+      "learning_rate": 0.00020354090354090352,
+      "loss": 0.6511,
+      "step": 1063
+    },
+    {
+      "epoch": 1.168749141837155,
+      "grad_norm": 0.47295433282852173,
+      "learning_rate": 0.0002034188034188034,
+      "loss": 0.5977,
+      "step": 1064
+    },
+    {
+      "epoch": 1.1698475902787313,
+      "grad_norm": 0.4402971565723419,
+      "learning_rate": 0.0002032967032967033,
+      "loss": 0.4824,
+      "step": 1065
+    },
+    {
+      "epoch": 1.1709460387203077,
+      "grad_norm": 0.3752620816230774,
+      "learning_rate": 0.00020317460317460315,
+      "loss": 0.6519,
+      "step": 1066
+    },
+    {
+      "epoch": 1.1720444871618838,
+      "grad_norm": 0.45207279920578003,
+      "learning_rate": 0.00020305250305250303,
+      "loss": 0.6869,
+      "step": 1067
+    },
+    {
+      "epoch": 1.1731429356034602,
+      "grad_norm": 0.4255804121494293,
+      "learning_rate": 0.00020293040293040292,
+      "loss": 0.7289,
+      "step": 1068
+    },
+    {
+      "epoch": 1.1742413840450363,
+      "grad_norm": 0.48725178837776184,
+      "learning_rate": 0.00020280830280830278,
+      "loss": 0.5472,
+      "step": 1069
+    },
+    {
+      "epoch": 1.1753398324866127,
+      "grad_norm": 0.37094470858573914,
+      "learning_rate": 0.00020268620268620266,
+      "loss": 0.558,
+      "step": 1070
+    },
+    {
+      "epoch": 1.176438280928189,
+      "grad_norm": 0.4191375970840454,
+      "learning_rate": 0.00020256410256410255,
+      "loss": 0.6422,
+      "step": 1071
+    },
+    {
+      "epoch": 1.1775367293697652,
+      "grad_norm": 0.4091531038284302,
+      "learning_rate": 0.0002024420024420024,
+      "loss": 0.6705,
+      "step": 1072
+    },
+    {
+      "epoch": 1.1786351778113415,
+      "grad_norm": 0.4876718521118164,
+      "learning_rate": 0.0002023199023199023,
+      "loss": 0.8265,
+      "step": 1073
+    },
+    {
+      "epoch": 1.1797336262529177,
+      "grad_norm": 0.43008798360824585,
+      "learning_rate": 0.0002021978021978022,
+      "loss": 0.5159,
+      "step": 1074
+    },
+    {
+      "epoch": 1.180832074694494,
+      "grad_norm": 0.47896140813827515,
+      "learning_rate": 0.00020207570207570204,
+      "loss": 0.5455,
+      "step": 1075
+    },
+    {
+      "epoch": 1.1819305231360704,
+      "grad_norm": 0.5313389301300049,
+      "learning_rate": 0.00020195360195360195,
+      "loss": 0.7628,
+      "step": 1076
+    },
+    {
+      "epoch": 1.1830289715776465,
+      "grad_norm": 0.46337512135505676,
+      "learning_rate": 0.00020183150183150184,
+      "loss": 0.6661,
+      "step": 1077
+    },
+    {
+      "epoch": 1.1841274200192229,
+      "grad_norm": 0.4304458498954773,
+      "learning_rate": 0.0002017094017094017,
+      "loss": 0.7019,
+      "step": 1078
+    },
+    {
+      "epoch": 1.185225868460799,
+      "grad_norm": 0.638445258140564,
+      "learning_rate": 0.00020158730158730158,
+      "loss": 0.6972,
+      "step": 1079
+    },
+    {
+      "epoch": 1.1863243169023754,
+      "grad_norm": 1.8217968940734863,
+      "learning_rate": 0.00020146520146520144,
+      "loss": 0.5217,
+      "step": 1080
+    },
+    {
+      "epoch": 1.1874227653439517,
+      "grad_norm": 0.4996611773967743,
+      "learning_rate": 0.00020134310134310132,
+      "loss": 0.6767,
+      "step": 1081
+    },
+    {
+      "epoch": 1.1885212137855279,
+      "grad_norm": 0.43705832958221436,
+      "learning_rate": 0.0002012210012210012,
+      "loss": 0.7364,
+      "step": 1082
+    },
+    {
+      "epoch": 1.1896196622271042,
+      "grad_norm": 0.4148736596107483,
+      "learning_rate": 0.00020109890109890107,
+      "loss": 0.7544,
+      "step": 1083
+    },
+    {
+      "epoch": 1.1907181106686804,
+      "grad_norm": 0.5772218108177185,
+      "learning_rate": 0.00020097680097680095,
+      "loss": 0.6349,
+      "step": 1084
+    },
+    {
+      "epoch": 1.1918165591102567,
+      "grad_norm": 0.9127015471458435,
+      "learning_rate": 0.00020085470085470084,
+      "loss": 0.4772,
+      "step": 1085
+    },
+    {
+      "epoch": 1.192915007551833,
+      "grad_norm": 0.46906840801239014,
+      "learning_rate": 0.0002007326007326007,
+      "loss": 0.6184,
+      "step": 1086
+    },
+    {
+      "epoch": 1.1940134559934092,
+      "grad_norm": 0.38405168056488037,
+      "learning_rate": 0.00020061050061050058,
+      "loss": 0.5027,
+      "step": 1087
+    },
+    {
+      "epoch": 1.1951119044349856,
+      "grad_norm": 0.6352836489677429,
+      "learning_rate": 0.00020048840048840047,
+      "loss": 0.6674,
+      "step": 1088
+    },
+    {
+      "epoch": 1.196210352876562,
+      "grad_norm": 0.6750807762145996,
+      "learning_rate": 0.00020036630036630033,
+      "loss": 0.5707,
+      "step": 1089
+    },
+    {
+      "epoch": 1.197308801318138,
+      "grad_norm": 0.5661985874176025,
+      "learning_rate": 0.00020024420024420024,
+      "loss": 0.8298,
+      "step": 1090
+    },
+    {
+      "epoch": 1.1984072497597145,
+      "grad_norm": 0.6393309831619263,
+      "learning_rate": 0.00020012210012210012,
+      "loss": 0.7397,
+      "step": 1091
+    },
+    {
+      "epoch": 1.1995056982012906,
+      "grad_norm": 0.5442856550216675,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 0.7176,
+      "step": 1092
+    },
+    {
+      "epoch": 1.200604146642867,
+      "grad_norm": 1.0100654363632202,
+      "learning_rate": 0.00019987789987789987,
+      "loss": 0.8052,
+      "step": 1093
+    },
+    {
+      "epoch": 1.201702595084443,
+      "grad_norm": 0.3916209936141968,
+      "learning_rate": 0.00019975579975579975,
+      "loss": 0.5951,
+      "step": 1094
+    },
+    {
+      "epoch": 1.2028010435260195,
+      "grad_norm": 0.3890608847141266,
+      "learning_rate": 0.0001996336996336996,
+      "loss": 0.8129,
+      "step": 1095
+    },
+    {
+      "epoch": 1.2038994919675958,
+      "grad_norm": 0.4267507493495941,
+      "learning_rate": 0.0001995115995115995,
+      "loss": 0.8741,
+      "step": 1096
+    },
+    {
+      "epoch": 1.204997940409172,
+      "grad_norm": 0.49055561423301697,
+      "learning_rate": 0.00019938949938949938,
+      "loss": 0.901,
+      "step": 1097
+    },
+    {
+      "epoch": 1.2060963888507483,
+      "grad_norm": 0.6662428379058838,
+      "learning_rate": 0.00019926739926739924,
+      "loss": 0.4971,
+      "step": 1098
+    },
+    {
+      "epoch": 1.2071948372923247,
+      "grad_norm": 0.4469052255153656,
+      "learning_rate": 0.00019914529914529913,
+      "loss": 0.6593,
+      "step": 1099
+    },
+    {
+      "epoch": 1.2082932857339008,
+      "grad_norm": 0.5514255166053772,
+      "learning_rate": 0.000199023199023199,
+      "loss": 0.8033,
+      "step": 1100
+    },
+    {
+      "epoch": 1.2093917341754772,
+      "grad_norm": 0.4838184714317322,
+      "learning_rate": 0.00019890109890109887,
+      "loss": 0.5533,
+      "step": 1101
+    },
+    {
+      "epoch": 1.2104901826170533,
+      "grad_norm": 0.6061891913414001,
+      "learning_rate": 0.00019877899877899876,
+      "loss": 0.5837,
+      "step": 1102
+    },
+    {
+      "epoch": 1.2115886310586297,
+      "grad_norm": 0.3387523889541626,
+      "learning_rate": 0.00019865689865689867,
+      "loss": 0.455,
+      "step": 1103
+    },
+    {
+      "epoch": 1.212687079500206,
+      "grad_norm": 0.5204731225967407,
+      "learning_rate": 0.0001985347985347985,
+      "loss": 0.6869,
+      "step": 1104
+    },
+    {
+      "epoch": 1.2137855279417822,
+      "grad_norm": 0.5747571587562561,
+      "learning_rate": 0.0001984126984126984,
+      "loss": 0.7208,
+      "step": 1105
+    },
+    {
+      "epoch": 1.2148839763833585,
+      "grad_norm": 0.5382461547851562,
+      "learning_rate": 0.00019829059829059824,
+      "loss": 0.6035,
+      "step": 1106
+    },
+    {
+      "epoch": 1.2159824248249347,
+      "grad_norm": 0.44335421919822693,
+      "learning_rate": 0.00019816849816849816,
+      "loss": 0.8563,
+      "step": 1107
+    },
+    {
+      "epoch": 1.217080873266511,
+      "grad_norm": 0.3059934675693512,
+      "learning_rate": 0.00019804639804639804,
+      "loss": 0.6422,
+      "step": 1108
+    },
+    {
+      "epoch": 1.2181793217080874,
+      "grad_norm": 0.4306177794933319,
+      "learning_rate": 0.0001979242979242979,
+      "loss": 0.5347,
+      "step": 1109
+    },
+    {
+      "epoch": 1.2192777701496635,
+      "grad_norm": 0.5196095705032349,
+      "learning_rate": 0.00019780219780219779,
+      "loss": 0.5996,
+      "step": 1110
+    },
+    {
+      "epoch": 1.22037621859124,
+      "grad_norm": 0.4814283549785614,
+      "learning_rate": 0.00019768009768009767,
+      "loss": 0.6782,
+      "step": 1111
+    },
+    {
+      "epoch": 1.2214746670328163,
+      "grad_norm": 0.2287791222333908,
+      "learning_rate": 0.00019755799755799753,
+      "loss": 0.5908,
+      "step": 1112
+    },
+    {
+      "epoch": 1.2225731154743924,
+      "grad_norm": 0.43044313788414,
+      "learning_rate": 0.00019743589743589742,
+      "loss": 0.6554,
+      "step": 1113
+    },
+    {
+      "epoch": 1.2236715639159688,
+      "grad_norm": 0.390874445438385,
+      "learning_rate": 0.0001973137973137973,
+      "loss": 0.5777,
+      "step": 1114
+    },
+    {
+      "epoch": 1.224770012357545,
+      "grad_norm": 0.5380458235740662,
+      "learning_rate": 0.00019719169719169716,
+      "loss": 0.467,
+      "step": 1115
+    },
+    {
+      "epoch": 1.2258684607991213,
+      "grad_norm": 0.6176440119743347,
+      "learning_rate": 0.00019706959706959704,
+      "loss": 0.5625,
+      "step": 1116
+    },
+    {
+      "epoch": 1.2269669092406974,
+      "grad_norm": 0.4321332275867462,
+      "learning_rate": 0.00019694749694749693,
+      "loss": 0.7262,
+      "step": 1117
+    },
+    {
+      "epoch": 1.2280653576822738,
+      "grad_norm": 0.5679623484611511,
+      "learning_rate": 0.0001968253968253968,
+      "loss": 0.8216,
+      "step": 1118
+    },
+    {
+      "epoch": 1.2291638061238501,
+      "grad_norm": 0.4741218686103821,
+      "learning_rate": 0.00019670329670329667,
+      "loss": 0.7164,
+      "step": 1119
+    },
+    {
+      "epoch": 1.2302622545654263,
+      "grad_norm": 0.6570267677307129,
+      "learning_rate": 0.00019658119658119659,
+      "loss": 0.7606,
+      "step": 1120
+    },
+    {
+      "epoch": 1.2313607030070026,
+      "grad_norm": 0.4256306290626526,
+      "learning_rate": 0.00019645909645909644,
+      "loss": 0.5137,
+      "step": 1121
+    },
+    {
+      "epoch": 1.232459151448579,
+      "grad_norm": 0.4444984793663025,
+      "learning_rate": 0.00019633699633699633,
+      "loss": 0.8863,
+      "step": 1122
+    },
+    {
+      "epoch": 1.2335575998901551,
+      "grad_norm": 0.458133339881897,
+      "learning_rate": 0.00019621489621489622,
+      "loss": 0.6445,
+      "step": 1123
+    },
+    {
+      "epoch": 1.2346560483317315,
+      "grad_norm": 0.6087627410888672,
+      "learning_rate": 0.00019609279609279607,
+      "loss": 0.5625,
+      "step": 1124
+    },
+    {
+      "epoch": 1.2357544967733076,
+      "grad_norm": 0.42782312631607056,
+      "learning_rate": 0.00019597069597069596,
+      "loss": 0.6321,
+      "step": 1125
+    },
+    {
+      "epoch": 1.236852945214884,
+      "grad_norm": 0.49623987078666687,
+      "learning_rate": 0.00019584859584859585,
+      "loss": 0.6473,
+      "step": 1126
+    },
+    {
+      "epoch": 1.2379513936564603,
+      "grad_norm": 0.5348198413848877,
+      "learning_rate": 0.0001957264957264957,
+      "loss": 0.6948,
+      "step": 1127
+    },
+    {
+      "epoch": 1.2390498420980365,
+      "grad_norm": 0.44476062059402466,
+      "learning_rate": 0.0001956043956043956,
+      "loss": 0.5917,
+      "step": 1128
+    },
+    {
+      "epoch": 1.2401482905396128,
+      "grad_norm": 0.5777286291122437,
+      "learning_rate": 0.00019548229548229547,
+      "loss": 0.7474,
+      "step": 1129
+    },
+    {
+      "epoch": 1.241246738981189,
+      "grad_norm": 0.3132689893245697,
+      "learning_rate": 0.00019536019536019533,
+      "loss": 0.5827,
+      "step": 1130
+    },
+    {
+      "epoch": 1.2423451874227653,
+      "grad_norm": 0.3898192346096039,
+      "learning_rate": 0.00019523809523809522,
+      "loss": 0.5469,
+      "step": 1131
+    },
+    {
+      "epoch": 1.2434436358643417,
+      "grad_norm": 0.338693767786026,
+      "learning_rate": 0.00019511599511599508,
+      "loss": 0.704,
+      "step": 1132
+    },
+    {
+      "epoch": 1.2445420843059178,
+      "grad_norm": 0.4276609718799591,
+      "learning_rate": 0.00019499389499389496,
+      "loss": 0.7269,
+      "step": 1133
+    },
+    {
+      "epoch": 1.2456405327474942,
+      "grad_norm": 0.7320281863212585,
+      "learning_rate": 0.00019487179487179487,
+      "loss": 0.62,
+      "step": 1134
+    },
+    {
+      "epoch": 1.2467389811890706,
+      "grad_norm": 0.4023820757865906,
+      "learning_rate": 0.0001947496947496947,
+      "loss": 0.4234,
+      "step": 1135
+    },
+    {
+      "epoch": 1.2478374296306467,
+      "grad_norm": 0.3218212425708771,
+      "learning_rate": 0.00019462759462759462,
+      "loss": 0.5325,
+      "step": 1136
+    },
+    {
+      "epoch": 1.248935878072223,
+      "grad_norm": 0.45131513476371765,
+      "learning_rate": 0.0001945054945054945,
+      "loss": 0.5667,
+      "step": 1137
+    },
+    {
+      "epoch": 1.2500343265137992,
+      "grad_norm": 0.604475200176239,
+      "learning_rate": 0.00019438339438339436,
+      "loss": 0.9018,
+      "step": 1138
+    },
+    {
+      "epoch": 1.2511327749553756,
+      "grad_norm": 0.46968311071395874,
+      "learning_rate": 0.00019426129426129425,
+      "loss": 0.7946,
+      "step": 1139
+    },
+    {
+      "epoch": 1.2522312233969517,
+      "grad_norm": 0.3960346281528473,
+      "learning_rate": 0.00019413919413919413,
+      "loss": 0.7719,
+      "step": 1140
+    },
+    {
+      "epoch": 1.253329671838528,
+      "grad_norm": 0.5146461129188538,
+      "learning_rate": 0.000194017094017094,
+      "loss": 0.8946,
+      "step": 1141
+    },
+    {
+      "epoch": 1.2544281202801044,
+      "grad_norm": 0.6343802809715271,
+      "learning_rate": 0.00019389499389499388,
+      "loss": 0.7822,
+      "step": 1142
+    },
+    {
+      "epoch": 1.2555265687216806,
+      "grad_norm": 0.4646434485912323,
+      "learning_rate": 0.00019377289377289376,
+      "loss": 0.6722,
+      "step": 1143
+    },
+    {
+      "epoch": 1.256625017163257,
+      "grad_norm": 0.48127877712249756,
+      "learning_rate": 0.00019365079365079362,
+      "loss": 0.9059,
+      "step": 1144
+    },
+    {
+      "epoch": 1.2577234656048333,
+      "grad_norm": 0.4040716290473938,
+      "learning_rate": 0.0001935286935286935,
+      "loss": 0.7288,
+      "step": 1145
+    },
+    {
+      "epoch": 1.2588219140464094,
+      "grad_norm": 0.43992865085601807,
+      "learning_rate": 0.0001934065934065934,
+      "loss": 0.5804,
+      "step": 1146
+    },
+    {
+      "epoch": 1.2599203624879858,
+      "grad_norm": 0.41578513383865356,
+      "learning_rate": 0.00019328449328449325,
+      "loss": 0.5459,
+      "step": 1147
+    },
+    {
+      "epoch": 1.261018810929562,
+      "grad_norm": 0.40165719389915466,
+      "learning_rate": 0.00019316239316239314,
+      "loss": 0.6001,
+      "step": 1148
+    },
+    {
+      "epoch": 1.2621172593711383,
+      "grad_norm": 0.43200212717056274,
+      "learning_rate": 0.00019304029304029305,
+      "loss": 0.8712,
+      "step": 1149
+    },
+    {
+      "epoch": 1.2632157078127144,
+      "grad_norm": 0.3217264413833618,
+      "learning_rate": 0.00019291819291819288,
+      "loss": 0.6074,
+      "step": 1150
+    },
+    {
+      "epoch": 1.2643141562542908,
+      "grad_norm": 0.3964528441429138,
+      "learning_rate": 0.0001927960927960928,
+      "loss": 0.6131,
+      "step": 1151
+    },
+    {
+      "epoch": 1.2654126046958671,
+      "grad_norm": 0.5151070952415466,
+      "learning_rate": 0.00019267399267399268,
+      "loss": 0.6992,
+      "step": 1152
+    },
+    {
+      "epoch": 1.2665110531374433,
+      "grad_norm": 0.5902129411697388,
+      "learning_rate": 0.00019255189255189254,
+      "loss": 0.7311,
+      "step": 1153
+    },
+    {
+      "epoch": 1.2676095015790196,
+      "grad_norm": 0.5386108160018921,
+      "learning_rate": 0.00019242979242979242,
+      "loss": 0.6469,
+      "step": 1154
+    },
+    {
+      "epoch": 1.268707950020596,
+      "grad_norm": 0.384093701839447,
+      "learning_rate": 0.0001923076923076923,
+      "loss": 0.7111,
+      "step": 1155
+    },
+    {
+      "epoch": 1.2698063984621721,
+      "grad_norm": 0.34160250425338745,
+      "learning_rate": 0.00019218559218559217,
+      "loss": 0.5396,
+      "step": 1156
+    },
+    {
+      "epoch": 1.2709048469037485,
+      "grad_norm": 0.6590912938117981,
+      "learning_rate": 0.00019206349206349205,
+      "loss": 1.1613,
+      "step": 1157
+    },
+    {
+      "epoch": 1.2720032953453249,
+      "grad_norm": 0.6230842471122742,
+      "learning_rate": 0.0001919413919413919,
+      "loss": 0.7701,
+      "step": 1158
+    },
+    {
+      "epoch": 1.273101743786901,
+      "grad_norm": 0.3881864547729492,
+      "learning_rate": 0.0001918192918192918,
+      "loss": 0.633,
+      "step": 1159
+    },
+    {
+      "epoch": 1.2742001922284774,
+      "grad_norm": 0.4538264274597168,
+      "learning_rate": 0.00019169719169719168,
+      "loss": 0.451,
+      "step": 1160
+    },
+    {
+      "epoch": 1.2752986406700535,
+      "grad_norm": 0.6188018321990967,
+      "learning_rate": 0.00019157509157509154,
+      "loss": 0.9563,
+      "step": 1161
+    },
+    {
+      "epoch": 1.2763970891116299,
+      "grad_norm": 0.4172852039337158,
+      "learning_rate": 0.00019145299145299142,
+      "loss": 0.8284,
+      "step": 1162
+    },
+    {
+      "epoch": 1.277495537553206,
+      "grad_norm": 0.338623583316803,
+      "learning_rate": 0.0001913308913308913,
+      "loss": 0.6745,
+      "step": 1163
+    },
+    {
+      "epoch": 1.2785939859947824,
+      "grad_norm": 0.3960900902748108,
+      "learning_rate": 0.00019120879120879117,
+      "loss": 0.6508,
+      "step": 1164
+    },
+    {
+      "epoch": 1.2796924344363587,
+      "grad_norm": 0.37232962250709534,
+      "learning_rate": 0.00019108669108669108,
+      "loss": 0.7347,
+      "step": 1165
+    },
+    {
+      "epoch": 1.2807908828779349,
+      "grad_norm": 0.47092223167419434,
+      "learning_rate": 0.00019096459096459097,
+      "loss": 0.8251,
+      "step": 1166
+    },
+    {
+      "epoch": 1.2818893313195112,
+      "grad_norm": 0.4647108316421509,
+      "learning_rate": 0.00019084249084249082,
+      "loss": 0.556,
+      "step": 1167
+    },
+    {
+      "epoch": 1.2829877797610876,
+      "grad_norm": 0.5812810659408569,
+      "learning_rate": 0.0001907203907203907,
+      "loss": 0.6802,
+      "step": 1168
+    },
+    {
+      "epoch": 1.2840862282026637,
+      "grad_norm": 0.3731052279472351,
+      "learning_rate": 0.0001905982905982906,
+      "loss": 0.6384,
+      "step": 1169
+    },
+    {
+      "epoch": 1.28518467664424,
+      "grad_norm": 0.47995856404304504,
+      "learning_rate": 0.00019047619047619045,
+      "loss": 0.4914,
+      "step": 1170
+    },
+    {
+      "epoch": 1.2862831250858162,
+      "grad_norm": 0.3223705589771271,
+      "learning_rate": 0.00019035409035409034,
+      "loss": 0.6676,
+      "step": 1171
+    },
+    {
+      "epoch": 1.2873815735273926,
+      "grad_norm": 0.5643377304077148,
+      "learning_rate": 0.00019023199023199023,
+      "loss": 0.8224,
+      "step": 1172
+    },
+    {
+      "epoch": 1.2884800219689687,
+      "grad_norm": 0.48324450850486755,
+      "learning_rate": 0.00019010989010989008,
+      "loss": 0.8005,
+      "step": 1173
+    },
+    {
+      "epoch": 1.289578470410545,
+      "grad_norm": 0.40516728162765503,
+      "learning_rate": 0.00018998778998778997,
+      "loss": 0.5463,
+      "step": 1174
+    },
+    {
+      "epoch": 1.2906769188521214,
+      "grad_norm": 0.45521625876426697,
+      "learning_rate": 0.00018986568986568985,
+      "loss": 0.7562,
+      "step": 1175
+    },
+    {
+      "epoch": 1.2917753672936976,
+      "grad_norm": 0.38747909665107727,
+      "learning_rate": 0.0001897435897435897,
+      "loss": 0.5074,
+      "step": 1176
+    },
+    {
+      "epoch": 1.292873815735274,
+      "grad_norm": 0.39688000082969666,
+      "learning_rate": 0.0001896214896214896,
+      "loss": 0.3551,
+      "step": 1177
+    },
+    {
+      "epoch": 1.2939722641768503,
+      "grad_norm": 0.6891604065895081,
+      "learning_rate": 0.0001894993894993895,
+      "loss": 0.601,
+      "step": 1178
+    },
+    {
+      "epoch": 1.2950707126184264,
+      "grad_norm": 0.5177300572395325,
+      "learning_rate": 0.00018937728937728934,
+      "loss": 0.5188,
+      "step": 1179
+    },
+    {
+      "epoch": 1.2961691610600028,
+      "grad_norm": 0.3166979253292084,
+      "learning_rate": 0.00018925518925518926,
+      "loss": 0.8411,
+      "step": 1180
+    },
+    {
+      "epoch": 1.2972676095015792,
+      "grad_norm": 0.6637437343597412,
+      "learning_rate": 0.00018913308913308914,
+      "loss": 0.7256,
+      "step": 1181
+    },
+    {
+      "epoch": 1.2983660579431553,
+      "grad_norm": 0.424932599067688,
+      "learning_rate": 0.000189010989010989,
+      "loss": 0.783,
+      "step": 1182
+    },
+    {
+      "epoch": 1.2994645063847314,
+      "grad_norm": 0.47751033306121826,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.7039,
+      "step": 1183
+    },
+    {
+      "epoch": 1.3005629548263078,
+      "grad_norm": 0.4332704544067383,
+      "learning_rate": 0.00018876678876678874,
+      "loss": 0.4797,
+      "step": 1184
+    },
+    {
+      "epoch": 1.3016614032678842,
+      "grad_norm": 0.439431756734848,
+      "learning_rate": 0.00018864468864468863,
+      "loss": 0.6256,
+      "step": 1185
+    },
+    {
+      "epoch": 1.3027598517094603,
+      "grad_norm": 0.4334176480770111,
+      "learning_rate": 0.00018852258852258851,
+      "loss": 0.5583,
+      "step": 1186
+    },
+    {
+      "epoch": 1.3038583001510367,
+      "grad_norm": 0.42080724239349365,
+      "learning_rate": 0.00018840048840048837,
+      "loss": 0.461,
+      "step": 1187
+    },
+    {
+      "epoch": 1.304956748592613,
+      "grad_norm": 0.41007399559020996,
+      "learning_rate": 0.00018827838827838826,
+      "loss": 0.4746,
+      "step": 1188
+    },
+    {
+      "epoch": 1.3060551970341892,
+      "grad_norm": 0.3763822019100189,
+      "learning_rate": 0.00018815628815628814,
+      "loss": 0.5352,
+      "step": 1189
+    },
+    {
+      "epoch": 1.3071536454757655,
+      "grad_norm": 0.5557730197906494,
+      "learning_rate": 0.000188034188034188,
+      "loss": 0.5404,
+      "step": 1190
+    },
+    {
+      "epoch": 1.3082520939173419,
+      "grad_norm": 0.43677788972854614,
+      "learning_rate": 0.0001879120879120879,
+      "loss": 0.7111,
+      "step": 1191
+    },
+    {
+      "epoch": 1.309350542358918,
+      "grad_norm": 0.6084219217300415,
+      "learning_rate": 0.00018778998778998777,
+      "loss": 0.7524,
+      "step": 1192
+    },
+    {
+      "epoch": 1.3104489908004944,
+      "grad_norm": 0.7219144701957703,
+      "learning_rate": 0.00018766788766788763,
+      "loss": 0.6182,
+      "step": 1193
+    },
+    {
+      "epoch": 1.3115474392420705,
+      "grad_norm": 0.5280331969261169,
+      "learning_rate": 0.00018754578754578752,
+      "loss": 0.8023,
+      "step": 1194
+    },
+    {
+      "epoch": 1.3126458876836469,
+      "grad_norm": 0.42130032181739807,
+      "learning_rate": 0.00018742368742368743,
+      "loss": 0.5673,
+      "step": 1195
+    },
+    {
+      "epoch": 1.313744336125223,
+      "grad_norm": 0.6063292026519775,
+      "learning_rate": 0.0001873015873015873,
+      "loss": 0.6438,
+      "step": 1196
+    },
+    {
+      "epoch": 1.3148427845667994,
+      "grad_norm": 0.4073690176010132,
+      "learning_rate": 0.00018717948717948717,
+      "loss": 0.7099,
+      "step": 1197
+    },
+    {
+      "epoch": 1.3159412330083757,
+      "grad_norm": 0.5419113636016846,
+      "learning_rate": 0.00018705738705738706,
+      "loss": 0.6451,
+      "step": 1198
+    },
+    {
+      "epoch": 1.3170396814499519,
+      "grad_norm": 0.4489867091178894,
+      "learning_rate": 0.00018693528693528692,
+      "loss": 0.7522,
+      "step": 1199
+    },
+    {
+      "epoch": 1.3181381298915282,
+      "grad_norm": 0.3536837697029114,
+      "learning_rate": 0.0001868131868131868,
+      "loss": 0.6201,
+      "step": 1200
+    },
+    {
+      "epoch": 1.3192365783331046,
+      "grad_norm": 0.42462313175201416,
+      "learning_rate": 0.0001866910866910867,
+      "loss": 0.4804,
+      "step": 1201
+    },
+    {
+      "epoch": 1.3203350267746807,
+      "grad_norm": 0.612319827079773,
+      "learning_rate": 0.00018656898656898655,
+      "loss": 0.8546,
+      "step": 1202
+    },
+    {
+      "epoch": 1.321433475216257,
+      "grad_norm": 0.5242000222206116,
+      "learning_rate": 0.00018644688644688643,
+      "loss": 0.7577,
+      "step": 1203
+    },
+    {
+      "epoch": 1.3225319236578332,
+      "grad_norm": 0.5688628554344177,
+      "learning_rate": 0.00018632478632478632,
+      "loss": 0.6645,
+      "step": 1204
+    },
+    {
+      "epoch": 1.3236303720994096,
+      "grad_norm": 0.3695731461048126,
+      "learning_rate": 0.00018620268620268618,
+      "loss": 0.4979,
+      "step": 1205
+    },
+    {
+      "epoch": 1.3247288205409857,
+      "grad_norm": 0.44525593519210815,
+      "learning_rate": 0.00018608058608058606,
+      "loss": 0.807,
+      "step": 1206
+    },
+    {
+      "epoch": 1.325827268982562,
+      "grad_norm": 0.37627971172332764,
+      "learning_rate": 0.00018595848595848595,
+      "loss": 0.6584,
+      "step": 1207
+    },
+    {
+      "epoch": 1.3269257174241385,
+      "grad_norm": 0.39727315306663513,
+      "learning_rate": 0.0001858363858363858,
+      "loss": 0.5565,
+      "step": 1208
+    },
+    {
+      "epoch": 1.3280241658657146,
+      "grad_norm": 0.4151424169540405,
+      "learning_rate": 0.00018571428571428572,
+      "loss": 0.81,
+      "step": 1209
+    },
+    {
+      "epoch": 1.329122614307291,
+      "grad_norm": 0.37529075145721436,
+      "learning_rate": 0.00018559218559218555,
+      "loss": 0.6188,
+      "step": 1210
+    },
+    {
+      "epoch": 1.3302210627488673,
+      "grad_norm": 0.43061408400535583,
+      "learning_rate": 0.00018547008547008546,
+      "loss": 0.814,
+      "step": 1211
+    },
+    {
+      "epoch": 1.3313195111904434,
+      "grad_norm": 0.437511682510376,
+      "learning_rate": 0.00018534798534798535,
+      "loss": 0.55,
+      "step": 1212
+    },
+    {
+      "epoch": 1.3324179596320198,
+      "grad_norm": 0.5172685980796814,
+      "learning_rate": 0.0001852258852258852,
+      "loss": 0.6551,
+      "step": 1213
+    },
+    {
+      "epoch": 1.3335164080735962,
+      "grad_norm": 0.3292716443538666,
+      "learning_rate": 0.0001851037851037851,
+      "loss": 0.5108,
+      "step": 1214
+    },
+    {
+      "epoch": 1.3346148565151723,
+      "grad_norm": 0.7129474878311157,
+      "learning_rate": 0.00018498168498168498,
+      "loss": 0.7197,
+      "step": 1215
+    },
+    {
+      "epoch": 1.3357133049567487,
+      "grad_norm": 0.46317145228385925,
+      "learning_rate": 0.00018485958485958483,
+      "loss": 0.6553,
+      "step": 1216
+    },
+    {
+      "epoch": 1.3368117533983248,
+      "grad_norm": 0.5539398789405823,
+      "learning_rate": 0.00018473748473748472,
+      "loss": 0.7057,
+      "step": 1217
+    },
+    {
+      "epoch": 1.3379102018399012,
+      "grad_norm": 0.40555253624916077,
+      "learning_rate": 0.0001846153846153846,
+      "loss": 0.5976,
+      "step": 1218
+    },
+    {
+      "epoch": 1.3390086502814773,
+      "grad_norm": 0.462704062461853,
+      "learning_rate": 0.00018449328449328446,
+      "loss": 0.7018,
+      "step": 1219
+    },
+    {
+      "epoch": 1.3401070987230537,
+      "grad_norm": 0.407287061214447,
+      "learning_rate": 0.00018437118437118435,
+      "loss": 0.4726,
+      "step": 1220
+    },
+    {
+      "epoch": 1.34120554716463,
+      "grad_norm": 0.3654995858669281,
+      "learning_rate": 0.00018424908424908423,
+      "loss": 0.5811,
+      "step": 1221
+    },
+    {
+      "epoch": 1.3423039956062062,
+      "grad_norm": 0.46455878019332886,
+      "learning_rate": 0.0001841269841269841,
+      "loss": 0.8998,
+      "step": 1222
+    },
+    {
+      "epoch": 1.3434024440477825,
+      "grad_norm": 0.47929346561431885,
+      "learning_rate": 0.00018400488400488398,
+      "loss": 0.7348,
+      "step": 1223
+    },
+    {
+      "epoch": 1.344500892489359,
+      "grad_norm": 0.7128652930259705,
+      "learning_rate": 0.0001838827838827839,
+      "loss": 1.2647,
+      "step": 1224
+    },
+    {
+      "epoch": 1.345599340930935,
+      "grad_norm": 0.3956572413444519,
+      "learning_rate": 0.00018376068376068372,
+      "loss": 0.6985,
+      "step": 1225
+    },
+    {
+      "epoch": 1.3466977893725114,
+      "grad_norm": 0.5585309863090515,
+      "learning_rate": 0.00018363858363858364,
+      "loss": 1.0086,
+      "step": 1226
+    },
+    {
+      "epoch": 1.3477962378140875,
+      "grad_norm": 1.5960838794708252,
+      "learning_rate": 0.00018351648351648352,
+      "loss": 0.644,
+      "step": 1227
+    },
+    {
+      "epoch": 1.3488946862556639,
+      "grad_norm": 0.6499342322349548,
+      "learning_rate": 0.00018339438339438338,
+      "loss": 0.7698,
+      "step": 1228
+    },
+    {
+      "epoch": 1.34999313469724,
+      "grad_norm": 0.42246925830841064,
+      "learning_rate": 0.00018327228327228326,
+      "loss": 0.5614,
+      "step": 1229
+    },
+    {
+      "epoch": 1.3510915831388164,
+      "grad_norm": 0.42192572355270386,
+      "learning_rate": 0.00018315018315018315,
+      "loss": 0.7726,
+      "step": 1230
+    },
+    {
+      "epoch": 1.3521900315803927,
+      "grad_norm": 0.6409221887588501,
+      "learning_rate": 0.000183028083028083,
+      "loss": 0.5928,
+      "step": 1231
+    },
+    {
+      "epoch": 1.3532884800219689,
+      "grad_norm": 1.328852653503418,
+      "learning_rate": 0.0001829059829059829,
+      "loss": 0.7861,
+      "step": 1232
+    },
+    {
+      "epoch": 1.3543869284635452,
+      "grad_norm": 0.4519331753253937,
+      "learning_rate": 0.00018278388278388275,
+      "loss": 0.5938,
+      "step": 1233
+    },
+    {
+      "epoch": 1.3554853769051216,
+      "grad_norm": 0.3942720592021942,
+      "learning_rate": 0.00018266178266178264,
+      "loss": 0.4781,
+      "step": 1234
+    },
+    {
+      "epoch": 1.3565838253466977,
+      "grad_norm": 0.5066869258880615,
+      "learning_rate": 0.00018253968253968252,
+      "loss": 0.8069,
+      "step": 1235
+    },
+    {
+      "epoch": 1.357682273788274,
+      "grad_norm": 0.37002792954444885,
+      "learning_rate": 0.00018241758241758238,
+      "loss": 0.5737,
+      "step": 1236
+    },
+    {
+      "epoch": 1.3587807222298505,
+      "grad_norm": 0.3738810122013092,
+      "learning_rate": 0.00018229548229548227,
+      "loss": 0.5169,
+      "step": 1237
+    },
+    {
+      "epoch": 1.3598791706714266,
+      "grad_norm": 0.44956260919570923,
+      "learning_rate": 0.00018217338217338215,
+      "loss": 0.5614,
+      "step": 1238
+    },
+    {
+      "epoch": 1.3609776191130027,
+      "grad_norm": 0.34839004278182983,
+      "learning_rate": 0.000182051282051282,
+      "loss": 0.5783,
+      "step": 1239
+    },
+    {
+      "epoch": 1.362076067554579,
+      "grad_norm": 0.30152127146720886,
+      "learning_rate": 0.00018192918192918192,
+      "loss": 0.4321,
+      "step": 1240
+    },
+    {
+      "epoch": 1.3631745159961555,
+      "grad_norm": 0.6672345399856567,
+      "learning_rate": 0.0001818070818070818,
+      "loss": 0.6073,
+      "step": 1241
+    },
+    {
+      "epoch": 1.3642729644377316,
+      "grad_norm": 0.45652687549591064,
+      "learning_rate": 0.00018168498168498167,
+      "loss": 0.6193,
+      "step": 1242
+    },
+    {
+      "epoch": 1.365371412879308,
+      "grad_norm": 0.6392306089401245,
+      "learning_rate": 0.00018156288156288155,
+      "loss": 0.8388,
+      "step": 1243
+    },
+    {
+      "epoch": 1.3664698613208843,
+      "grad_norm": 0.5510252714157104,
+      "learning_rate": 0.00018144078144078144,
+      "loss": 0.6512,
+      "step": 1244
+    },
+    {
+      "epoch": 1.3675683097624605,
+      "grad_norm": 0.38780227303504944,
+      "learning_rate": 0.0001813186813186813,
+      "loss": 0.6835,
+      "step": 1245
+    },
+    {
+      "epoch": 1.3686667582040368,
+      "grad_norm": 0.47472965717315674,
+      "learning_rate": 0.00018119658119658118,
+      "loss": 0.6625,
+      "step": 1246
+    },
+    {
+      "epoch": 1.3697652066456132,
+      "grad_norm": 0.3599228262901306,
+      "learning_rate": 0.00018107448107448107,
+      "loss": 0.5063,
+      "step": 1247
+    },
+    {
+      "epoch": 1.3708636550871893,
+      "grad_norm": 0.3284567892551422,
+      "learning_rate": 0.00018095238095238093,
+      "loss": 0.7679,
+      "step": 1248
+    },
+    {
+      "epoch": 1.3719621035287657,
+      "grad_norm": 0.5258575081825256,
+      "learning_rate": 0.0001808302808302808,
+      "loss": 0.6213,
+      "step": 1249
+    },
+    {
+      "epoch": 1.3730605519703418,
+      "grad_norm": 0.3211069405078888,
+      "learning_rate": 0.0001807081807081807,
+      "loss": 0.5306,
+      "step": 1250
+    },
+    {
+      "epoch": 1.3741590004119182,
+      "grad_norm": 0.6325588822364807,
+      "learning_rate": 0.00018058608058608056,
+      "loss": 0.8104,
+      "step": 1251
+    },
+    {
+      "epoch": 1.3752574488534943,
+      "grad_norm": 0.4994303584098816,
+      "learning_rate": 0.00018046398046398044,
+      "loss": 0.6464,
+      "step": 1252
+    },
+    {
+      "epoch": 1.3763558972950707,
+      "grad_norm": 0.3013019263744354,
+      "learning_rate": 0.00018034188034188035,
+      "loss": 0.4749,
+      "step": 1253
+    },
+    {
+      "epoch": 1.377454345736647,
+      "grad_norm": 1.0342131853103638,
+      "learning_rate": 0.00018021978021978018,
+      "loss": 0.7995,
+      "step": 1254
+    },
+    {
+      "epoch": 1.3785527941782232,
+      "grad_norm": 0.40213823318481445,
+      "learning_rate": 0.0001800976800976801,
+      "loss": 0.8791,
+      "step": 1255
+    },
+    {
+      "epoch": 1.3796512426197995,
+      "grad_norm": 0.37126532196998596,
+      "learning_rate": 0.00017997557997557998,
+      "loss": 0.551,
+      "step": 1256
+    },
+    {
+      "epoch": 1.380749691061376,
+      "grad_norm": 0.3417685031890869,
+      "learning_rate": 0.00017985347985347984,
+      "loss": 0.583,
+      "step": 1257
+    },
+    {
+      "epoch": 1.381848139502952,
+      "grad_norm": 0.33571329712867737,
+      "learning_rate": 0.00017973137973137973,
+      "loss": 0.4927,
+      "step": 1258
+    },
+    {
+      "epoch": 1.3829465879445284,
+      "grad_norm": 0.5128073692321777,
+      "learning_rate": 0.00017960927960927959,
+      "loss": 0.5903,
+      "step": 1259
+    },
+    {
+      "epoch": 1.3840450363861048,
+      "grad_norm": 0.5345245599746704,
+      "learning_rate": 0.00017948717948717947,
+      "loss": 0.5828,
+      "step": 1260
+    },
+    {
+      "epoch": 1.385143484827681,
+      "grad_norm": 0.312639981508255,
+      "learning_rate": 0.00017936507936507936,
+      "loss": 0.6905,
+      "step": 1261
+    },
+    {
+      "epoch": 1.386241933269257,
+      "grad_norm": 0.4795394837856293,
+      "learning_rate": 0.00017924297924297921,
+      "loss": 0.6193,
+      "step": 1262
+    },
+    {
+      "epoch": 1.3873403817108334,
+      "grad_norm": 0.39672231674194336,
+      "learning_rate": 0.0001791208791208791,
+      "loss": 0.7833,
+      "step": 1263
+    },
+    {
+      "epoch": 1.3884388301524098,
+      "grad_norm": 0.46752655506134033,
+      "learning_rate": 0.00017899877899877899,
+      "loss": 0.6385,
+      "step": 1264
+    },
+    {
+      "epoch": 1.389537278593986,
+      "grad_norm": 0.5376736521720886,
+      "learning_rate": 0.00017887667887667884,
+      "loss": 0.6362,
+      "step": 1265
+    },
+    {
+      "epoch": 1.3906357270355623,
+      "grad_norm": 0.5675904750823975,
+      "learning_rate": 0.00017875457875457873,
+      "loss": 0.7975,
+      "step": 1266
+    },
+    {
+      "epoch": 1.3917341754771386,
+      "grad_norm": 0.5429015755653381,
+      "learning_rate": 0.00017863247863247861,
+      "loss": 0.5415,
+      "step": 1267
+    },
+    {
+      "epoch": 1.3928326239187148,
+      "grad_norm": 0.3714626729488373,
+      "learning_rate": 0.00017851037851037847,
+      "loss": 0.7104,
+      "step": 1268
+    },
+    {
+      "epoch": 1.3939310723602911,
+      "grad_norm": 0.7549324035644531,
+      "learning_rate": 0.00017838827838827836,
+      "loss": 0.698,
+      "step": 1269
+    },
+    {
+      "epoch": 1.3950295208018675,
+      "grad_norm": 0.36867257952690125,
+      "learning_rate": 0.00017826617826617827,
+      "loss": 0.6019,
+      "step": 1270
+    },
+    {
+      "epoch": 1.3961279692434436,
+      "grad_norm": 0.42439624667167664,
+      "learning_rate": 0.00017814407814407813,
+      "loss": 0.4626,
+      "step": 1271
+    },
+    {
+      "epoch": 1.39722641768502,
+      "grad_norm": 0.4768877923488617,
+      "learning_rate": 0.00017802197802197802,
+      "loss": 0.671,
+      "step": 1272
+    },
+    {
+      "epoch": 1.3983248661265961,
+      "grad_norm": 0.3415908217430115,
+      "learning_rate": 0.0001778998778998779,
+      "loss": 0.5904,
+      "step": 1273
+    },
+    {
+      "epoch": 1.3994233145681725,
+      "grad_norm": 0.5370535850524902,
+      "learning_rate": 0.00017777777777777776,
+      "loss": 0.578,
+      "step": 1274
+    },
+    {
+      "epoch": 1.4005217630097486,
+      "grad_norm": 0.61114901304245,
+      "learning_rate": 0.00017765567765567764,
+      "loss": 0.6498,
+      "step": 1275
+    },
+    {
+      "epoch": 1.401620211451325,
+      "grad_norm": 0.3491772711277008,
+      "learning_rate": 0.00017753357753357753,
+      "loss": 0.6057,
+      "step": 1276
+    },
+    {
+      "epoch": 1.4027186598929013,
+      "grad_norm": 0.4992705285549164,
+      "learning_rate": 0.0001774114774114774,
+      "loss": 0.8541,
+      "step": 1277
+    },
+    {
+      "epoch": 1.4038171083344775,
+      "grad_norm": 0.5476379990577698,
+      "learning_rate": 0.00017728937728937727,
+      "loss": 0.5608,
+      "step": 1278
+    },
+    {
+      "epoch": 1.4049155567760538,
+      "grad_norm": 0.6107895374298096,
+      "learning_rate": 0.00017716727716727716,
+      "loss": 0.7437,
+      "step": 1279
+    },
+    {
+      "epoch": 1.4060140052176302,
+      "grad_norm": 0.510809600353241,
+      "learning_rate": 0.00017704517704517702,
+      "loss": 0.6569,
+      "step": 1280
+    },
+    {
+      "epoch": 1.4071124536592063,
+      "grad_norm": 0.5050077438354492,
+      "learning_rate": 0.0001769230769230769,
+      "loss": 0.6566,
+      "step": 1281
+    },
+    {
+      "epoch": 1.4082109021007827,
+      "grad_norm": 0.44812703132629395,
+      "learning_rate": 0.0001768009768009768,
+      "loss": 0.6557,
+      "step": 1282
+    },
+    {
+      "epoch": 1.4093093505423588,
+      "grad_norm": 0.5216537714004517,
+      "learning_rate": 0.00017667887667887665,
+      "loss": 0.7311,
+      "step": 1283
+    },
+    {
+      "epoch": 1.4104077989839352,
+      "grad_norm": 0.5608856081962585,
+      "learning_rate": 0.00017655677655677656,
+      "loss": 0.9001,
+      "step": 1284
+    },
+    {
+      "epoch": 1.4115062474255113,
+      "grad_norm": 0.47205066680908203,
+      "learning_rate": 0.0001764346764346764,
+      "loss": 0.5214,
+      "step": 1285
+    },
+    {
+      "epoch": 1.4126046958670877,
+      "grad_norm": 0.4073629081249237,
+      "learning_rate": 0.0001763125763125763,
+      "loss": 0.483,
+      "step": 1286
+    },
+    {
+      "epoch": 1.413703144308664,
+      "grad_norm": 0.42381593585014343,
+      "learning_rate": 0.0001761904761904762,
+      "loss": 0.4895,
+      "step": 1287
+    },
+    {
+      "epoch": 1.4148015927502402,
+      "grad_norm": 0.629356861114502,
+      "learning_rate": 0.00017606837606837605,
+      "loss": 0.4639,
+      "step": 1288
+    },
+    {
+      "epoch": 1.4159000411918166,
+      "grad_norm": 0.3123486340045929,
+      "learning_rate": 0.00017594627594627593,
+      "loss": 0.4575,
+      "step": 1289
+    },
+    {
+      "epoch": 1.416998489633393,
+      "grad_norm": 0.4163682460784912,
+      "learning_rate": 0.00017582417582417582,
+      "loss": 0.7511,
+      "step": 1290
+    },
+    {
+      "epoch": 1.418096938074969,
+      "grad_norm": 0.5697455406188965,
+      "learning_rate": 0.00017570207570207568,
+      "loss": 0.5977,
+      "step": 1291
+    },
+    {
+      "epoch": 1.4191953865165454,
+      "grad_norm": 0.39232510328292847,
+      "learning_rate": 0.00017557997557997556,
+      "loss": 0.6133,
+      "step": 1292
+    },
+    {
+      "epoch": 1.4202938349581218,
+      "grad_norm": 0.5452993512153625,
+      "learning_rate": 0.00017545787545787545,
+      "loss": 0.6596,
+      "step": 1293
+    },
+    {
+      "epoch": 1.421392283399698,
+      "grad_norm": 0.39080601930618286,
+      "learning_rate": 0.0001753357753357753,
+      "loss": 0.7422,
+      "step": 1294
+    },
+    {
+      "epoch": 1.4224907318412743,
+      "grad_norm": 0.6513398289680481,
+      "learning_rate": 0.0001752136752136752,
+      "loss": 0.5277,
+      "step": 1295
+    },
+    {
+      "epoch": 1.4235891802828504,
+      "grad_norm": 0.4627130329608917,
+      "learning_rate": 0.00017509157509157508,
+      "loss": 0.6296,
+      "step": 1296
+    },
+    {
+      "epoch": 1.4246876287244268,
+      "grad_norm": 0.499700129032135,
+      "learning_rate": 0.00017496947496947494,
+      "loss": 0.689,
+      "step": 1297
+    },
+    {
+      "epoch": 1.425786077166003,
+      "grad_norm": 0.4668709635734558,
+      "learning_rate": 0.00017484737484737482,
+      "loss": 0.784,
+      "step": 1298
+    },
+    {
+      "epoch": 1.4268845256075793,
+      "grad_norm": 0.6378145217895508,
+      "learning_rate": 0.00017472527472527473,
+      "loss": 0.5077,
+      "step": 1299
+    },
+    {
+      "epoch": 1.4279829740491556,
+      "grad_norm": 0.6320174336433411,
+      "learning_rate": 0.00017460317460317457,
+      "loss": 1.061,
+      "step": 1300
+    },
+    {
+      "epoch": 1.4290814224907318,
+      "grad_norm": 0.48719078302383423,
+      "learning_rate": 0.00017448107448107448,
+      "loss": 0.7181,
+      "step": 1301
+    },
+    {
+      "epoch": 1.4301798709323081,
+      "grad_norm": 0.5345287919044495,
+      "learning_rate": 0.00017435897435897436,
+      "loss": 0.5599,
+      "step": 1302
+    },
+    {
+      "epoch": 1.4312783193738845,
+      "grad_norm": 0.567857563495636,
+      "learning_rate": 0.00017423687423687422,
+      "loss": 0.6294,
+      "step": 1303
+    },
+    {
+      "epoch": 1.4323767678154606,
+      "grad_norm": 0.5715040564537048,
+      "learning_rate": 0.0001741147741147741,
+      "loss": 0.5326,
+      "step": 1304
+    },
+    {
+      "epoch": 1.433475216257037,
+      "grad_norm": 0.40048834681510925,
+      "learning_rate": 0.000173992673992674,
+      "loss": 0.687,
+      "step": 1305
+    },
+    {
+      "epoch": 1.4345736646986131,
+      "grad_norm": 0.4964540898799896,
+      "learning_rate": 0.00017387057387057385,
+      "loss": 0.6149,
+      "step": 1306
+    },
+    {
+      "epoch": 1.4356721131401895,
+      "grad_norm": 0.5018569231033325,
+      "learning_rate": 0.00017374847374847374,
+      "loss": 0.4224,
+      "step": 1307
+    },
+    {
+      "epoch": 1.4367705615817656,
+      "grad_norm": 0.6026094555854797,
+      "learning_rate": 0.00017362637362637362,
+      "loss": 0.8934,
+      "step": 1308
+    },
+    {
+      "epoch": 1.437869010023342,
+      "grad_norm": 0.33409950137138367,
+      "learning_rate": 0.00017350427350427348,
+      "loss": 0.6725,
+      "step": 1309
+    },
+    {
+      "epoch": 1.4389674584649184,
+      "grad_norm": 0.43982234597206116,
+      "learning_rate": 0.00017338217338217337,
+      "loss": 0.9203,
+      "step": 1310
+    },
+    {
+      "epoch": 1.4400659069064945,
+      "grad_norm": 0.843877911567688,
+      "learning_rate": 0.00017326007326007322,
+      "loss": 0.6028,
+      "step": 1311
+    },
+    {
+      "epoch": 1.4411643553480709,
+      "grad_norm": 0.35148733854293823,
+      "learning_rate": 0.0001731379731379731,
+      "loss": 0.7503,
+      "step": 1312
+    },
+    {
+      "epoch": 1.4422628037896472,
+      "grad_norm": 0.4561845362186432,
+      "learning_rate": 0.000173015873015873,
+      "loss": 0.6577,
+      "step": 1313
+    },
+    {
+      "epoch": 1.4433612522312234,
+      "grad_norm": 0.47295713424682617,
+      "learning_rate": 0.00017289377289377285,
+      "loss": 0.8013,
+      "step": 1314
+    },
+    {
+      "epoch": 1.4444597006727997,
+      "grad_norm": 0.46340033411979675,
+      "learning_rate": 0.00017277167277167277,
+      "loss": 0.73,
+      "step": 1315
+    },
+    {
+      "epoch": 1.445558149114376,
+      "grad_norm": 0.49221453070640564,
+      "learning_rate": 0.00017264957264957265,
+      "loss": 0.6735,
+      "step": 1316
+    },
+    {
+      "epoch": 1.4466565975559522,
+      "grad_norm": 0.36250925064086914,
+      "learning_rate": 0.0001725274725274725,
+      "loss": 0.7463,
+      "step": 1317
+    },
+    {
+      "epoch": 1.4477550459975284,
+      "grad_norm": 0.3832615911960602,
+      "learning_rate": 0.0001724053724053724,
+      "loss": 0.7295,
+      "step": 1318
+    },
+    {
+      "epoch": 1.4488534944391047,
+      "grad_norm": 0.7413591742515564,
+      "learning_rate": 0.00017228327228327228,
+      "loss": 0.7627,
+      "step": 1319
+    },
+    {
+      "epoch": 1.449951942880681,
+      "grad_norm": 0.45626765489578247,
+      "learning_rate": 0.00017216117216117214,
+      "loss": 0.727,
+      "step": 1320
+    },
+    {
+      "epoch": 1.4510503913222572,
+      "grad_norm": 0.3024120330810547,
+      "learning_rate": 0.00017203907203907202,
+      "loss": 0.3986,
+      "step": 1321
+    },
+    {
+      "epoch": 1.4521488397638336,
+      "grad_norm": 0.31635284423828125,
+      "learning_rate": 0.0001719169719169719,
+      "loss": 0.3469,
+      "step": 1322
+    },
+    {
+      "epoch": 1.45324728820541,
+      "grad_norm": 0.36893391609191895,
+      "learning_rate": 0.00017179487179487177,
+      "loss": 0.7017,
+      "step": 1323
+    },
+    {
+      "epoch": 1.454345736646986,
+      "grad_norm": 0.4804024398326874,
+      "learning_rate": 0.00017167277167277165,
+      "loss": 0.8811,
+      "step": 1324
+    },
+    {
+      "epoch": 1.4554441850885624,
+      "grad_norm": 0.4446522295475006,
+      "learning_rate": 0.00017155067155067154,
+      "loss": 0.8027,
+      "step": 1325
+    },
+    {
+      "epoch": 1.4565426335301388,
+      "grad_norm": 0.27936413884162903,
+      "learning_rate": 0.0001714285714285714,
+      "loss": 0.3846,
+      "step": 1326
+    },
+    {
+      "epoch": 1.457641081971715,
+      "grad_norm": 0.3312259316444397,
+      "learning_rate": 0.00017130647130647128,
+      "loss": 0.4852,
+      "step": 1327
+    },
+    {
+      "epoch": 1.4587395304132913,
+      "grad_norm": 0.4751642644405365,
+      "learning_rate": 0.0001711843711843712,
+      "loss": 0.7337,
+      "step": 1328
+    },
+    {
+      "epoch": 1.4598379788548674,
+      "grad_norm": 0.5365067720413208,
+      "learning_rate": 0.00017106227106227103,
+      "loss": 0.8052,
+      "step": 1329
+    },
+    {
+      "epoch": 1.4609364272964438,
+      "grad_norm": 0.5944942831993103,
+      "learning_rate": 0.00017094017094017094,
+      "loss": 0.7673,
+      "step": 1330
+    },
+    {
+      "epoch": 1.46203487573802,
+      "grad_norm": 0.48244431614875793,
+      "learning_rate": 0.00017081807081807083,
+      "loss": 0.855,
+      "step": 1331
+    },
+    {
+      "epoch": 1.4631333241795963,
+      "grad_norm": 0.32348135113716125,
+      "learning_rate": 0.00017069597069597068,
+      "loss": 0.5133,
+      "step": 1332
+    },
+    {
+      "epoch": 1.4642317726211727,
+      "grad_norm": 0.6455866694450378,
+      "learning_rate": 0.00017057387057387057,
+      "loss": 0.6825,
+      "step": 1333
+    },
+    {
+      "epoch": 1.4653302210627488,
+      "grad_norm": 0.3937522768974304,
+      "learning_rate": 0.00017045177045177045,
+      "loss": 0.6335,
+      "step": 1334
+    },
+    {
+      "epoch": 1.4664286695043252,
+      "grad_norm": 0.33579352498054504,
+      "learning_rate": 0.0001703296703296703,
+      "loss": 0.4711,
+      "step": 1335
+    },
+    {
+      "epoch": 1.4675271179459015,
+      "grad_norm": 0.5055533647537231,
+      "learning_rate": 0.0001702075702075702,
+      "loss": 0.6512,
+      "step": 1336
+    },
+    {
+      "epoch": 1.4686255663874777,
+      "grad_norm": 0.40702182054519653,
+      "learning_rate": 0.00017008547008547006,
+      "loss": 0.8833,
+      "step": 1337
+    },
+    {
+      "epoch": 1.469724014829054,
+      "grad_norm": 0.3574135899543762,
+      "learning_rate": 0.00016996336996336994,
+      "loss": 0.7127,
+      "step": 1338
+    },
+    {
+      "epoch": 1.4708224632706302,
+      "grad_norm": 0.45641472935676575,
+      "learning_rate": 0.00016984126984126983,
+      "loss": 0.7258,
+      "step": 1339
+    },
+    {
+      "epoch": 1.4719209117122065,
+      "grad_norm": 1.5012352466583252,
+      "learning_rate": 0.0001697191697191697,
+      "loss": 0.8065,
+      "step": 1340
+    },
+    {
+      "epoch": 1.4730193601537827,
+      "grad_norm": 0.5025885701179504,
+      "learning_rate": 0.00016959706959706957,
+      "loss": 0.9377,
+      "step": 1341
+    },
+    {
+      "epoch": 1.474117808595359,
+      "grad_norm": 0.2942202687263489,
+      "learning_rate": 0.00016947496947496946,
+      "loss": 0.5693,
+      "step": 1342
+    },
+    {
+      "epoch": 1.4752162570369354,
+      "grad_norm": 0.48770126700401306,
+      "learning_rate": 0.00016935286935286932,
+      "loss": 0.5483,
+      "step": 1343
+    },
+    {
+      "epoch": 1.4763147054785115,
+      "grad_norm": 0.3853349983692169,
+      "learning_rate": 0.0001692307692307692,
+      "loss": 0.5787,
+      "step": 1344
+    },
+    {
+      "epoch": 1.4774131539200879,
+      "grad_norm": 0.3593169152736664,
+      "learning_rate": 0.00016910866910866911,
+      "loss": 0.6426,
+      "step": 1345
+    },
+    {
+      "epoch": 1.4785116023616642,
+      "grad_norm": 0.5932713150978088,
+      "learning_rate": 0.00016898656898656897,
+      "loss": 0.7543,
+      "step": 1346
+    },
+    {
+      "epoch": 1.4796100508032404,
+      "grad_norm": 0.43406638503074646,
+      "learning_rate": 0.00016886446886446886,
+      "loss": 0.7868,
+      "step": 1347
+    },
+    {
+      "epoch": 1.4807084992448167,
+      "grad_norm": 0.38596048951148987,
+      "learning_rate": 0.00016874236874236874,
+      "loss": 0.49,
+      "step": 1348
+    },
+    {
+      "epoch": 1.481806947686393,
+      "grad_norm": 0.42844533920288086,
+      "learning_rate": 0.0001686202686202686,
+      "loss": 0.6485,
+      "step": 1349
+    },
+    {
+      "epoch": 1.4829053961279692,
+      "grad_norm": 0.5165280103683472,
+      "learning_rate": 0.0001684981684981685,
+      "loss": 0.6924,
+      "step": 1350
+    },
+    {
+      "epoch": 1.4840038445695456,
+      "grad_norm": 0.5717988610267639,
+      "learning_rate": 0.00016837606837606837,
+      "loss": 0.5624,
+      "step": 1351
+    },
+    {
+      "epoch": 1.4851022930111217,
+      "grad_norm": 0.4384293556213379,
+      "learning_rate": 0.00016825396825396823,
+      "loss": 0.7895,
+      "step": 1352
+    },
+    {
+      "epoch": 1.486200741452698,
+      "grad_norm": 0.5472243428230286,
+      "learning_rate": 0.00016813186813186812,
+      "loss": 0.8838,
+      "step": 1353
+    },
+    {
+      "epoch": 1.4872991898942742,
+      "grad_norm": 0.3903232216835022,
+      "learning_rate": 0.000168009768009768,
+      "loss": 0.5452,
+      "step": 1354
+    },
+    {
+      "epoch": 1.4883976383358506,
+      "grad_norm": 0.3799583613872528,
+      "learning_rate": 0.00016788766788766786,
+      "loss": 0.8931,
+      "step": 1355
+    },
+    {
+      "epoch": 1.489496086777427,
+      "grad_norm": 0.4481349289417267,
+      "learning_rate": 0.00016776556776556775,
+      "loss": 0.5956,
+      "step": 1356
+    },
+    {
+      "epoch": 1.490594535219003,
+      "grad_norm": 0.45875266194343567,
+      "learning_rate": 0.00016764346764346763,
+      "loss": 0.4729,
+      "step": 1357
+    },
+    {
+      "epoch": 1.4916929836605795,
+      "grad_norm": 0.494112104177475,
+      "learning_rate": 0.0001675213675213675,
+      "loss": 0.6416,
+      "step": 1358
+    },
+    {
+      "epoch": 1.4927914321021558,
+      "grad_norm": 0.3976772725582123,
+      "learning_rate": 0.0001673992673992674,
+      "loss": 0.6601,
+      "step": 1359
+    },
+    {
+      "epoch": 1.493889880543732,
+      "grad_norm": 0.29009610414505005,
+      "learning_rate": 0.0001672771672771673,
+      "loss": 0.4261,
+      "step": 1360
+    },
+    {
+      "epoch": 1.4949883289853083,
+      "grad_norm": 0.5540419816970825,
+      "learning_rate": 0.00016715506715506715,
+      "loss": 0.8206,
+      "step": 1361
+    },
+    {
+      "epoch": 1.4960867774268845,
+      "grad_norm": 0.41308313608169556,
+      "learning_rate": 0.00016703296703296703,
+      "loss": 0.7862,
+      "step": 1362
+    },
+    {
+      "epoch": 1.4971852258684608,
+      "grad_norm": 0.6565150618553162,
+      "learning_rate": 0.0001669108669108669,
+      "loss": 0.6963,
+      "step": 1363
+    },
+    {
+      "epoch": 1.498283674310037,
+      "grad_norm": 0.4901321530342102,
+      "learning_rate": 0.00016678876678876678,
+      "loss": 0.7063,
+      "step": 1364
+    },
+    {
+      "epoch": 1.4993821227516133,
+      "grad_norm": 0.4676086902618408,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.5142,
+      "step": 1365
+    },
+    {
+      "epoch": 1.5004805711931897,
+      "grad_norm": 0.4745628833770752,
+      "learning_rate": 0.00016654456654456652,
+      "loss": 0.7659,
+      "step": 1366
+    },
+    {
+      "epoch": 1.5015790196347658,
+      "grad_norm": 0.42693057656288147,
+      "learning_rate": 0.0001664224664224664,
+      "loss": 0.9233,
+      "step": 1367
+    },
+    {
+      "epoch": 1.5026774680763422,
+      "grad_norm": 0.4110391139984131,
+      "learning_rate": 0.0001663003663003663,
+      "loss": 0.5062,
+      "step": 1368
+    },
+    {
+      "epoch": 1.5037759165179185,
+      "grad_norm": 0.3090996742248535,
+      "learning_rate": 0.00016617826617826615,
+      "loss": 0.4462,
+      "step": 1369
+    },
+    {
+      "epoch": 1.5048743649594947,
+      "grad_norm": 0.42027410864830017,
+      "learning_rate": 0.00016605616605616603,
+      "loss": 0.8589,
+      "step": 1370
+    },
+    {
+      "epoch": 1.505972813401071,
+      "grad_norm": 0.38396796584129333,
+      "learning_rate": 0.00016593406593406592,
+      "loss": 0.6609,
+      "step": 1371
+    },
+    {
+      "epoch": 1.5070712618426474,
+      "grad_norm": 0.5236012935638428,
+      "learning_rate": 0.00016581196581196578,
+      "loss": 0.6506,
+      "step": 1372
+    },
+    {
+      "epoch": 1.5081697102842235,
+      "grad_norm": 0.7232113480567932,
+      "learning_rate": 0.00016568986568986566,
+      "loss": 0.6689,
+      "step": 1373
+    },
+    {
+      "epoch": 1.5092681587257997,
+      "grad_norm": 0.4777502417564392,
+      "learning_rate": 0.00016556776556776558,
+      "loss": 0.5701,
+      "step": 1374
+    },
+    {
+      "epoch": 1.510366607167376,
+      "grad_norm": 0.39154767990112305,
+      "learning_rate": 0.0001654456654456654,
+      "loss": 0.4906,
+      "step": 1375
+    },
+    {
+      "epoch": 1.5114650556089524,
+      "grad_norm": 0.469382107257843,
+      "learning_rate": 0.00016532356532356532,
+      "loss": 0.5768,
+      "step": 1376
+    },
+    {
+      "epoch": 1.5125635040505285,
+      "grad_norm": 0.3485945761203766,
+      "learning_rate": 0.0001652014652014652,
+      "loss": 0.7814,
+      "step": 1377
+    },
+    {
+      "epoch": 1.513661952492105,
+      "grad_norm": 0.4375949203968048,
+      "learning_rate": 0.00016507936507936506,
+      "loss": 0.6328,
+      "step": 1378
+    },
+    {
+      "epoch": 1.5147604009336813,
+      "grad_norm": 0.47778064012527466,
+      "learning_rate": 0.00016495726495726495,
+      "loss": 0.635,
+      "step": 1379
+    },
+    {
+      "epoch": 1.5158588493752574,
+      "grad_norm": 0.3515126705169678,
+      "learning_rate": 0.00016483516483516484,
+      "loss": 0.7014,
+      "step": 1380
+    },
+    {
+      "epoch": 1.5169572978168337,
+      "grad_norm": 0.3710018992424011,
+      "learning_rate": 0.0001647130647130647,
+      "loss": 0.7903,
+      "step": 1381
+    },
+    {
+      "epoch": 1.51805574625841,
+      "grad_norm": 0.37630394101142883,
+      "learning_rate": 0.00016459096459096458,
+      "loss": 0.5446,
+      "step": 1382
+    },
+    {
+      "epoch": 1.5191541946999862,
+      "grad_norm": 0.4312807321548462,
+      "learning_rate": 0.00016446886446886446,
+      "loss": 0.6101,
+      "step": 1383
+    },
+    {
+      "epoch": 1.5202526431415624,
+      "grad_norm": 0.399384468793869,
+      "learning_rate": 0.00016434676434676432,
+      "loss": 0.5734,
+      "step": 1384
+    },
+    {
+      "epoch": 1.521351091583139,
+      "grad_norm": 0.41233471035957336,
+      "learning_rate": 0.0001642246642246642,
+      "loss": 0.6525,
+      "step": 1385
+    },
+    {
+      "epoch": 1.522449540024715,
+      "grad_norm": 0.5215228199958801,
+      "learning_rate": 0.0001641025641025641,
+      "loss": 0.4804,
+      "step": 1386
+    },
+    {
+      "epoch": 1.5235479884662912,
+      "grad_norm": 0.42069393396377563,
+      "learning_rate": 0.00016398046398046395,
+      "loss": 0.5517,
+      "step": 1387
+    },
+    {
+      "epoch": 1.5246464369078676,
+      "grad_norm": 1.7902978658676147,
+      "learning_rate": 0.00016385836385836384,
+      "loss": 0.6295,
+      "step": 1388
+    },
+    {
+      "epoch": 1.525744885349444,
+      "grad_norm": 0.7353507280349731,
+      "learning_rate": 0.0001637362637362637,
+      "loss": 1.0585,
+      "step": 1389
+    },
+    {
+      "epoch": 1.52684333379102,
+      "grad_norm": 0.45992404222488403,
+      "learning_rate": 0.0001636141636141636,
+      "loss": 0.7671,
+      "step": 1390
+    },
+    {
+      "epoch": 1.5279417822325965,
+      "grad_norm": 0.3927334249019623,
+      "learning_rate": 0.0001634920634920635,
+      "loss": 0.7479,
+      "step": 1391
+    },
+    {
+      "epoch": 1.5290402306741728,
+      "grad_norm": 0.32833003997802734,
+      "learning_rate": 0.00016336996336996335,
+      "loss": 0.5774,
+      "step": 1392
+    },
+    {
+      "epoch": 1.530138679115749,
+      "grad_norm": 0.4306529462337494,
+      "learning_rate": 0.00016324786324786324,
+      "loss": 0.6317,
+      "step": 1393
+    },
+    {
+      "epoch": 1.5312371275573253,
+      "grad_norm": 0.5411052703857422,
+      "learning_rate": 0.00016312576312576312,
+      "loss": 0.6637,
+      "step": 1394
+    },
+    {
+      "epoch": 1.5323355759989017,
+      "grad_norm": 0.633800745010376,
+      "learning_rate": 0.00016300366300366298,
+      "loss": 0.7145,
+      "step": 1395
+    },
+    {
+      "epoch": 1.5334340244404778,
+      "grad_norm": 0.6986578702926636,
+      "learning_rate": 0.00016288156288156287,
+      "loss": 0.7194,
+      "step": 1396
+    },
+    {
+      "epoch": 1.534532472882054,
+      "grad_norm": 0.5223686695098877,
+      "learning_rate": 0.00016275946275946275,
+      "loss": 0.7849,
+      "step": 1397
+    },
+    {
+      "epoch": 1.5356309213236303,
+      "grad_norm": 0.5342483520507812,
+      "learning_rate": 0.0001626373626373626,
+      "loss": 0.8885,
+      "step": 1398
+    },
+    {
+      "epoch": 1.5367293697652067,
+      "grad_norm": 0.5467656850814819,
+      "learning_rate": 0.0001625152625152625,
+      "loss": 0.6265,
+      "step": 1399
+    },
+    {
+      "epoch": 1.5378278182067828,
+      "grad_norm": 0.4483658969402313,
+      "learning_rate": 0.00016239316239316238,
+      "loss": 0.7133,
+      "step": 1400
+    },
+    {
+      "epoch": 1.5389262666483592,
+      "grad_norm": 0.5714216232299805,
+      "learning_rate": 0.00016227106227106224,
+      "loss": 0.5212,
+      "step": 1401
+    },
+    {
+      "epoch": 1.5400247150899355,
+      "grad_norm": 0.5487145781517029,
+      "learning_rate": 0.00016214896214896213,
+      "loss": 0.6276,
+      "step": 1402
+    },
+    {
+      "epoch": 1.5411231635315117,
+      "grad_norm": 0.3687078654766083,
+      "learning_rate": 0.00016202686202686204,
+      "loss": 0.7512,
+      "step": 1403
+    },
+    {
+      "epoch": 1.542221611973088,
+      "grad_norm": 0.3596762418746948,
+      "learning_rate": 0.00016190476190476187,
+      "loss": 0.7192,
+      "step": 1404
+    },
+    {
+      "epoch": 1.5433200604146644,
+      "grad_norm": 0.4092305898666382,
+      "learning_rate": 0.00016178266178266178,
+      "loss": 0.7339,
+      "step": 1405
+    },
+    {
+      "epoch": 1.5444185088562405,
+      "grad_norm": 0.4018193483352661,
+      "learning_rate": 0.00016166056166056167,
+      "loss": 0.7213,
+      "step": 1406
+    },
+    {
+      "epoch": 1.5455169572978167,
+      "grad_norm": 0.4993208646774292,
+      "learning_rate": 0.00016153846153846153,
+      "loss": 0.6362,
+      "step": 1407
+    },
+    {
+      "epoch": 1.5466154057393933,
+      "grad_norm": 0.3958855867385864,
+      "learning_rate": 0.0001614163614163614,
+      "loss": 0.8482,
+      "step": 1408
+    },
+    {
+      "epoch": 1.5477138541809694,
+      "grad_norm": 0.32689765095710754,
+      "learning_rate": 0.0001612942612942613,
+      "loss": 0.6583,
+      "step": 1409
+    },
+    {
+      "epoch": 1.5488123026225455,
+      "grad_norm": 0.48947611451148987,
+      "learning_rate": 0.00016117216117216116,
+      "loss": 0.6707,
+      "step": 1410
+    },
+    {
+      "epoch": 1.549910751064122,
+      "grad_norm": 0.3446139395236969,
+      "learning_rate": 0.00016105006105006104,
+      "loss": 0.8914,
+      "step": 1411
+    },
+    {
+      "epoch": 1.5510091995056983,
+      "grad_norm": 0.585746705532074,
+      "learning_rate": 0.0001609279609279609,
+      "loss": 0.5413,
+      "step": 1412
+    },
+    {
+      "epoch": 1.5521076479472744,
+      "grad_norm": 0.6561328172683716,
+      "learning_rate": 0.00016080586080586079,
+      "loss": 0.3728,
+      "step": 1413
+    },
+    {
+      "epoch": 1.5532060963888508,
+      "grad_norm": 0.47158828377723694,
+      "learning_rate": 0.00016068376068376067,
+      "loss": 0.6525,
+      "step": 1414
+    },
+    {
+      "epoch": 1.5543045448304271,
+      "grad_norm": 0.3676914572715759,
+      "learning_rate": 0.00016056166056166053,
+      "loss": 0.7395,
+      "step": 1415
+    },
+    {
+      "epoch": 1.5554029932720033,
+      "grad_norm": 0.608076810836792,
+      "learning_rate": 0.00016043956043956041,
+      "loss": 0.5289,
+      "step": 1416
+    },
+    {
+      "epoch": 1.5565014417135794,
+      "grad_norm": 0.44940462708473206,
+      "learning_rate": 0.0001603174603174603,
+      "loss": 0.6282,
+      "step": 1417
+    },
+    {
+      "epoch": 1.557599890155156,
+      "grad_norm": 0.48062869906425476,
+      "learning_rate": 0.00016019536019536016,
+      "loss": 0.7438,
+      "step": 1418
+    },
+    {
+      "epoch": 1.5586983385967321,
+      "grad_norm": 0.43834635615348816,
+      "learning_rate": 0.00016007326007326004,
+      "loss": 0.4248,
+      "step": 1419
+    },
+    {
+      "epoch": 1.5597967870383083,
+      "grad_norm": 0.5203731060028076,
+      "learning_rate": 0.00015995115995115996,
+      "loss": 0.91,
+      "step": 1420
+    },
+    {
+      "epoch": 1.5608952354798846,
+      "grad_norm": 0.5766960978507996,
+      "learning_rate": 0.00015982905982905981,
+      "loss": 0.7211,
+      "step": 1421
+    },
+    {
+      "epoch": 1.561993683921461,
+      "grad_norm": 0.3048666715621948,
+      "learning_rate": 0.0001597069597069597,
+      "loss": 0.5618,
+      "step": 1422
+    },
+    {
+      "epoch": 1.5630921323630371,
+      "grad_norm": 0.3916679322719574,
+      "learning_rate": 0.00015958485958485959,
+      "loss": 0.6954,
+      "step": 1423
+    },
+    {
+      "epoch": 1.5641905808046135,
+      "grad_norm": 0.6336612105369568,
+      "learning_rate": 0.00015946275946275944,
+      "loss": 0.6368,
+      "step": 1424
+    },
+    {
+      "epoch": 1.5652890292461898,
+      "grad_norm": 0.8314816355705261,
+      "learning_rate": 0.00015934065934065933,
+      "loss": 0.7633,
+      "step": 1425
+    },
+    {
+      "epoch": 1.566387477687766,
+      "grad_norm": 0.46973487734794617,
+      "learning_rate": 0.00015921855921855922,
+      "loss": 0.6915,
+      "step": 1426
+    },
+    {
+      "epoch": 1.5674859261293423,
+      "grad_norm": 0.48737633228302,
+      "learning_rate": 0.00015909645909645907,
+      "loss": 0.5346,
+      "step": 1427
+    },
+    {
+      "epoch": 1.5685843745709187,
+      "grad_norm": 0.548876941204071,
+      "learning_rate": 0.00015897435897435896,
+      "loss": 1.0449,
+      "step": 1428
+    },
+    {
+      "epoch": 1.5696828230124948,
+      "grad_norm": 0.5039654970169067,
+      "learning_rate": 0.00015885225885225884,
+      "loss": 0.9953,
+      "step": 1429
+    },
+    {
+      "epoch": 1.570781271454071,
+      "grad_norm": 0.7233378887176514,
+      "learning_rate": 0.0001587301587301587,
+      "loss": 0.7068,
+      "step": 1430
+    },
+    {
+      "epoch": 1.5718797198956473,
+      "grad_norm": 0.5767638683319092,
+      "learning_rate": 0.0001586080586080586,
+      "loss": 0.8055,
+      "step": 1431
+    },
+    {
+      "epoch": 1.5729781683372237,
+      "grad_norm": 0.34450021386146545,
+      "learning_rate": 0.00015848595848595847,
+      "loss": 0.726,
+      "step": 1432
+    },
+    {
+      "epoch": 1.5740766167787998,
+      "grad_norm": 0.8474962711334229,
+      "learning_rate": 0.00015836385836385833,
+      "loss": 0.6974,
+      "step": 1433
+    },
+    {
+      "epoch": 1.5751750652203762,
+      "grad_norm": 1.565746545791626,
+      "learning_rate": 0.00015824175824175824,
+      "loss": 0.7766,
+      "step": 1434
+    },
+    {
+      "epoch": 1.5762735136619526,
+      "grad_norm": 0.4393616020679474,
+      "learning_rate": 0.00015811965811965813,
+      "loss": 0.6071,
+      "step": 1435
+    },
+    {
+      "epoch": 1.5773719621035287,
+      "grad_norm": 0.5209214091300964,
+      "learning_rate": 0.000157997557997558,
+      "loss": 0.7546,
+      "step": 1436
+    },
+    {
+      "epoch": 1.578470410545105,
+      "grad_norm": 0.6069398522377014,
+      "learning_rate": 0.00015787545787545787,
+      "loss": 0.7322,
+      "step": 1437
+    },
+    {
+      "epoch": 1.5795688589866814,
+      "grad_norm": 0.6168296337127686,
+      "learning_rate": 0.00015775335775335773,
+      "loss": 0.5169,
+      "step": 1438
+    },
+    {
+      "epoch": 1.5806673074282576,
+      "grad_norm": 0.25368016958236694,
+      "learning_rate": 0.00015763125763125762,
+      "loss": 0.4838,
+      "step": 1439
+    },
+    {
+      "epoch": 1.5817657558698337,
+      "grad_norm": 0.4165039360523224,
+      "learning_rate": 0.0001575091575091575,
+      "loss": 1.0135,
+      "step": 1440
+    },
+    {
+      "epoch": 1.5828642043114103,
+      "grad_norm": 0.4596197307109833,
+      "learning_rate": 0.00015738705738705736,
+      "loss": 0.5545,
+      "step": 1441
+    },
+    {
+      "epoch": 1.5839626527529864,
+      "grad_norm": 0.5077592730522156,
+      "learning_rate": 0.00015726495726495725,
+      "loss": 0.7754,
+      "step": 1442
+    },
+    {
+      "epoch": 1.5850611011945626,
+      "grad_norm": 0.5041285157203674,
+      "learning_rate": 0.00015714285714285713,
+      "loss": 0.8384,
+      "step": 1443
+    },
+    {
+      "epoch": 1.586159549636139,
+      "grad_norm": 0.40924420952796936,
+      "learning_rate": 0.000157020757020757,
+      "loss": 0.5511,
+      "step": 1444
+    },
+    {
+      "epoch": 1.5872579980777153,
+      "grad_norm": 0.4800551235675812,
+      "learning_rate": 0.00015689865689865688,
+      "loss": 0.6154,
+      "step": 1445
+    },
+    {
+      "epoch": 1.5883564465192914,
+      "grad_norm": 0.433174729347229,
+      "learning_rate": 0.00015677655677655676,
+      "loss": 0.6158,
+      "step": 1446
+    },
+    {
+      "epoch": 1.5894548949608678,
+      "grad_norm": 0.29649895429611206,
+      "learning_rate": 0.00015665445665445662,
+      "loss": 0.5729,
+      "step": 1447
+    },
+    {
+      "epoch": 1.5905533434024441,
+      "grad_norm": 0.3815969228744507,
+      "learning_rate": 0.0001565323565323565,
+      "loss": 0.6748,
+      "step": 1448
+    },
+    {
+      "epoch": 1.5916517918440203,
+      "grad_norm": 0.4933919608592987,
+      "learning_rate": 0.00015641025641025642,
+      "loss": 0.7683,
+      "step": 1449
+    },
+    {
+      "epoch": 1.5927502402855966,
+      "grad_norm": 0.5053071975708008,
+      "learning_rate": 0.00015628815628815625,
+      "loss": 0.6779,
+      "step": 1450
+    },
+    {
+      "epoch": 1.593848688727173,
+      "grad_norm": 0.3900013566017151,
+      "learning_rate": 0.00015616605616605616,
+      "loss": 0.6326,
+      "step": 1451
+    },
+    {
+      "epoch": 1.5949471371687491,
+      "grad_norm": 0.5823982357978821,
+      "learning_rate": 0.00015604395604395605,
+      "loss": 0.6104,
+      "step": 1452
+    },
+    {
+      "epoch": 1.5960455856103253,
+      "grad_norm": 0.5277792811393738,
+      "learning_rate": 0.0001559218559218559,
+      "loss": 0.6647,
+      "step": 1453
+    },
+    {
+      "epoch": 1.5971440340519016,
+      "grad_norm": 0.32926440238952637,
+      "learning_rate": 0.0001557997557997558,
+      "loss": 0.6064,
+      "step": 1454
+    },
+    {
+      "epoch": 1.598242482493478,
+      "grad_norm": 0.7350378036499023,
+      "learning_rate": 0.00015567765567765568,
+      "loss": 0.7951,
+      "step": 1455
+    },
+    {
+      "epoch": 1.5993409309350541,
+      "grad_norm": 0.4125807285308838,
+      "learning_rate": 0.00015555555555555554,
+      "loss": 0.7761,
+      "step": 1456
+    },
+    {
+      "epoch": 1.6004393793766305,
+      "grad_norm": 0.49707722663879395,
+      "learning_rate": 0.00015543345543345542,
+      "loss": 0.7299,
+      "step": 1457
+    },
+    {
+      "epoch": 1.6015378278182069,
+      "grad_norm": 0.3240358829498291,
+      "learning_rate": 0.0001553113553113553,
+      "loss": 0.4832,
+      "step": 1458
+    },
+    {
+      "epoch": 1.602636276259783,
+      "grad_norm": 0.44430434703826904,
+      "learning_rate": 0.00015518925518925517,
+      "loss": 0.5968,
+      "step": 1459
+    },
+    {
+      "epoch": 1.6037347247013594,
+      "grad_norm": 0.3702992796897888,
+      "learning_rate": 0.00015506715506715505,
+      "loss": 0.7177,
+      "step": 1460
+    },
+    {
+      "epoch": 1.6048331731429357,
+      "grad_norm": 0.5001052618026733,
+      "learning_rate": 0.00015494505494505494,
+      "loss": 0.7448,
+      "step": 1461
+    },
+    {
+      "epoch": 1.6059316215845119,
+      "grad_norm": 0.45969969034194946,
+      "learning_rate": 0.0001548229548229548,
+      "loss": 0.8292,
+      "step": 1462
+    },
+    {
+      "epoch": 1.607030070026088,
+      "grad_norm": 0.46075674891471863,
+      "learning_rate": 0.00015470085470085468,
+      "loss": 0.5624,
+      "step": 1463
+    },
+    {
+      "epoch": 1.6081285184676646,
+      "grad_norm": 2.077080488204956,
+      "learning_rate": 0.00015457875457875454,
+      "loss": 0.6643,
+      "step": 1464
+    },
+    {
+      "epoch": 1.6092269669092407,
+      "grad_norm": 0.46008172631263733,
+      "learning_rate": 0.00015445665445665445,
+      "loss": 0.6329,
+      "step": 1465
+    },
+    {
+      "epoch": 1.6103254153508169,
+      "grad_norm": 0.5016405582427979,
+      "learning_rate": 0.00015433455433455434,
+      "loss": 0.7692,
+      "step": 1466
+    },
+    {
+      "epoch": 1.6114238637923932,
+      "grad_norm": 0.46292269229888916,
+      "learning_rate": 0.0001542124542124542,
+      "loss": 0.6485,
+      "step": 1467
+    },
+    {
+      "epoch": 1.6125223122339696,
+      "grad_norm": 0.4498538672924042,
+      "learning_rate": 0.00015409035409035408,
+      "loss": 0.598,
+      "step": 1468
+    },
+    {
+      "epoch": 1.6136207606755457,
+      "grad_norm": 0.3537295162677765,
+      "learning_rate": 0.00015396825396825397,
+      "loss": 0.6356,
+      "step": 1469
+    },
+    {
+      "epoch": 1.614719209117122,
+      "grad_norm": 0.9966747164726257,
+      "learning_rate": 0.00015384615384615382,
+      "loss": 0.6627,
+      "step": 1470
+    },
+    {
+      "epoch": 1.6158176575586984,
+      "grad_norm": 0.9386951327323914,
+      "learning_rate": 0.0001537240537240537,
+      "loss": 0.8148,
+      "step": 1471
+    },
+    {
+      "epoch": 1.6169161060002746,
+      "grad_norm": 0.3452979028224945,
+      "learning_rate": 0.0001536019536019536,
+      "loss": 0.5778,
+      "step": 1472
+    },
+    {
+      "epoch": 1.618014554441851,
+      "grad_norm": 0.3443523049354553,
+      "learning_rate": 0.00015347985347985345,
+      "loss": 0.9228,
+      "step": 1473
+    },
+    {
+      "epoch": 1.6191130028834273,
+      "grad_norm": 0.5345872044563293,
+      "learning_rate": 0.00015335775335775334,
+      "loss": 0.4682,
+      "step": 1474
+    },
+    {
+      "epoch": 1.6202114513250034,
+      "grad_norm": 0.35112351179122925,
+      "learning_rate": 0.00015323565323565322,
+      "loss": 0.5482,
+      "step": 1475
+    },
+    {
+      "epoch": 1.6213098997665796,
+      "grad_norm": 0.39090535044670105,
+      "learning_rate": 0.00015311355311355308,
+      "loss": 0.825,
+      "step": 1476
+    },
+    {
+      "epoch": 1.622408348208156,
+      "grad_norm": 1.1684538125991821,
+      "learning_rate": 0.00015299145299145297,
+      "loss": 0.6561,
+      "step": 1477
+    },
+    {
+      "epoch": 1.6235067966497323,
+      "grad_norm": 0.4006233513355255,
+      "learning_rate": 0.00015286935286935288,
+      "loss": 0.3647,
+      "step": 1478
+    },
+    {
+      "epoch": 1.6246052450913084,
+      "grad_norm": 0.30577126145362854,
+      "learning_rate": 0.0001527472527472527,
+      "loss": 0.4934,
+      "step": 1479
+    },
+    {
+      "epoch": 1.6257036935328848,
+      "grad_norm": 0.39927995204925537,
+      "learning_rate": 0.00015262515262515263,
+      "loss": 0.6028,
+      "step": 1480
+    },
+    {
+      "epoch": 1.6268021419744612,
+      "grad_norm": 0.49143150448799133,
+      "learning_rate": 0.0001525030525030525,
+      "loss": 0.4595,
+      "step": 1481
+    },
+    {
+      "epoch": 1.6279005904160373,
+      "grad_norm": 0.8603225946426392,
+      "learning_rate": 0.00015238095238095237,
+      "loss": 0.8617,
+      "step": 1482
+    },
+    {
+      "epoch": 1.6289990388576137,
+      "grad_norm": 0.534269392490387,
+      "learning_rate": 0.00015225885225885225,
+      "loss": 0.6648,
+      "step": 1483
+    },
+    {
+      "epoch": 1.63009748729919,
+      "grad_norm": 0.4987354278564453,
+      "learning_rate": 0.00015213675213675214,
+      "loss": 0.5908,
+      "step": 1484
+    },
+    {
+      "epoch": 1.6311959357407662,
+      "grad_norm": 0.5739774107933044,
+      "learning_rate": 0.000152014652014652,
+      "loss": 0.7652,
+      "step": 1485
+    },
+    {
+      "epoch": 1.6322943841823423,
+      "grad_norm": 0.5343801975250244,
+      "learning_rate": 0.00015189255189255188,
+      "loss": 0.6864,
+      "step": 1486
+    },
+    {
+      "epoch": 1.6333928326239189,
+      "grad_norm": 0.45683905482292175,
+      "learning_rate": 0.00015177045177045177,
+      "loss": 0.7179,
+      "step": 1487
+    },
+    {
+      "epoch": 1.634491281065495,
+      "grad_norm": 0.5020450949668884,
+      "learning_rate": 0.00015164835164835163,
+      "loss": 0.4356,
+      "step": 1488
+    },
+    {
+      "epoch": 1.6355897295070712,
+      "grad_norm": 0.3870914876461029,
+      "learning_rate": 0.0001515262515262515,
+      "loss": 0.692,
+      "step": 1489
+    },
+    {
+      "epoch": 1.6366881779486475,
+      "grad_norm": 0.5256255269050598,
+      "learning_rate": 0.00015140415140415137,
+      "loss": 0.7184,
+      "step": 1490
+    },
+    {
+      "epoch": 1.6377866263902239,
+      "grad_norm": 0.27588197588920593,
+      "learning_rate": 0.00015128205128205126,
+      "loss": 0.6928,
+      "step": 1491
+    },
+    {
+      "epoch": 1.6388850748318,
+      "grad_norm": 0.43336692452430725,
+      "learning_rate": 0.00015115995115995114,
+      "loss": 0.7357,
+      "step": 1492
+    },
+    {
+      "epoch": 1.6399835232733764,
+      "grad_norm": 0.7952486872673035,
+      "learning_rate": 0.000151037851037851,
+      "loss": 0.5536,
+      "step": 1493
+    },
+    {
+      "epoch": 1.6410819717149527,
+      "grad_norm": 3.8659090995788574,
+      "learning_rate": 0.00015091575091575089,
+      "loss": 0.6409,
+      "step": 1494
+    },
+    {
+      "epoch": 1.6421804201565289,
+      "grad_norm": 0.3824027478694916,
+      "learning_rate": 0.0001507936507936508,
+      "loss": 0.5988,
+      "step": 1495
+    },
+    {
+      "epoch": 1.643278868598105,
+      "grad_norm": 0.45106491446495056,
+      "learning_rate": 0.00015067155067155066,
+      "loss": 0.7568,
+      "step": 1496
+    },
+    {
+      "epoch": 1.6443773170396816,
+      "grad_norm": 0.719417154788971,
+      "learning_rate": 0.00015054945054945054,
+      "loss": 0.8191,
+      "step": 1497
+    },
+    {
+      "epoch": 1.6454757654812577,
+      "grad_norm": 0.4702167212963104,
+      "learning_rate": 0.00015042735042735043,
+      "loss": 0.6761,
+      "step": 1498
+    },
+    {
+      "epoch": 1.6465742139228339,
+      "grad_norm": 0.49441996216773987,
+      "learning_rate": 0.0001503052503052503,
+      "loss": 0.7323,
+      "step": 1499
+    },
+    {
+      "epoch": 1.6476726623644102,
+      "grad_norm": 0.623470664024353,
+      "learning_rate": 0.00015018315018315017,
+      "loss": 0.8384,
+      "step": 1500
+    },
+    {
+      "epoch": 1.6487711108059866,
+      "grad_norm": 0.5583334565162659,
+      "learning_rate": 0.00015006105006105006,
+      "loss": 0.8238,
+      "step": 1501
+    },
+    {
+      "epoch": 1.6498695592475627,
+      "grad_norm": 0.4803924560546875,
+      "learning_rate": 0.00014993894993894994,
+      "loss": 0.5322,
+      "step": 1502
+    },
+    {
+      "epoch": 1.650968007689139,
+      "grad_norm": 0.709605872631073,
+      "learning_rate": 0.0001498168498168498,
+      "loss": 0.8254,
+      "step": 1503
+    },
+    {
+      "epoch": 1.6520664561307155,
+      "grad_norm": 0.48047375679016113,
+      "learning_rate": 0.0001496947496947497,
+      "loss": 0.5263,
+      "step": 1504
+    },
+    {
+      "epoch": 1.6531649045722916,
+      "grad_norm": 0.41796261072158813,
+      "learning_rate": 0.00014957264957264957,
+      "loss": 0.5803,
+      "step": 1505
+    },
+    {
+      "epoch": 1.654263353013868,
+      "grad_norm": 0.7576707601547241,
+      "learning_rate": 0.00014945054945054943,
+      "loss": 0.545,
+      "step": 1506
+    },
+    {
+      "epoch": 1.6553618014554443,
+      "grad_norm": 0.4668630063533783,
+      "learning_rate": 0.00014932844932844932,
+      "loss": 0.6213,
+      "step": 1507
+    },
+    {
+      "epoch": 1.6564602498970205,
+      "grad_norm": 0.9730806350708008,
+      "learning_rate": 0.00014920634920634917,
+      "loss": 0.5415,
+      "step": 1508
+    },
+    {
+      "epoch": 1.6575586983385966,
+      "grad_norm": 0.39670151472091675,
+      "learning_rate": 0.0001490842490842491,
+      "loss": 0.7931,
+      "step": 1509
+    },
+    {
+      "epoch": 1.658657146780173,
+      "grad_norm": 0.6003556847572327,
+      "learning_rate": 0.00014896214896214895,
+      "loss": 0.7494,
+      "step": 1510
+    },
+    {
+      "epoch": 1.6597555952217493,
+      "grad_norm": 0.4335152506828308,
+      "learning_rate": 0.00014884004884004883,
+      "loss": 0.7003,
+      "step": 1511
+    },
+    {
+      "epoch": 1.6608540436633255,
+      "grad_norm": 0.34025630354881287,
+      "learning_rate": 0.00014871794871794872,
+      "loss": 0.9012,
+      "step": 1512
+    },
+    {
+      "epoch": 1.6619524921049018,
+      "grad_norm": 0.403934508562088,
+      "learning_rate": 0.00014859584859584858,
+      "loss": 0.717,
+      "step": 1513
+    },
+    {
+      "epoch": 1.6630509405464782,
+      "grad_norm": 0.45691147446632385,
+      "learning_rate": 0.00014847374847374846,
+      "loss": 0.4833,
+      "step": 1514
+    },
+    {
+      "epoch": 1.6641493889880543,
+      "grad_norm": 0.42266151309013367,
+      "learning_rate": 0.00014835164835164835,
+      "loss": 0.5892,
+      "step": 1515
+    },
+    {
+      "epoch": 1.6652478374296307,
+      "grad_norm": 0.392337441444397,
+      "learning_rate": 0.0001482295482295482,
+      "loss": 0.7748,
+      "step": 1516
+    },
+    {
+      "epoch": 1.666346285871207,
+      "grad_norm": 0.352081298828125,
+      "learning_rate": 0.0001481074481074481,
+      "loss": 0.6018,
+      "step": 1517
+    },
+    {
+      "epoch": 1.6674447343127832,
+      "grad_norm": 0.46293389797210693,
+      "learning_rate": 0.00014798534798534798,
+      "loss": 0.4696,
+      "step": 1518
+    },
+    {
+      "epoch": 1.6685431827543593,
+      "grad_norm": 0.6427372097969055,
+      "learning_rate": 0.00014786324786324786,
+      "loss": 0.7279,
+      "step": 1519
+    },
+    {
+      "epoch": 1.669641631195936,
+      "grad_norm": 0.500382125377655,
+      "learning_rate": 0.00014774114774114772,
+      "loss": 0.7395,
+      "step": 1520
+    },
+    {
+      "epoch": 1.670740079637512,
+      "grad_norm": 0.4410606920719147,
+      "learning_rate": 0.0001476190476190476,
+      "loss": 0.501,
+      "step": 1521
+    },
+    {
+      "epoch": 1.6718385280790882,
+      "grad_norm": 0.5587645769119263,
+      "learning_rate": 0.0001474969474969475,
+      "loss": 0.8655,
+      "step": 1522
+    },
+    {
+      "epoch": 1.6729369765206645,
+      "grad_norm": 0.4312286376953125,
+      "learning_rate": 0.00014737484737484735,
+      "loss": 0.9578,
+      "step": 1523
+    },
+    {
+      "epoch": 1.674035424962241,
+      "grad_norm": 0.48694175481796265,
+      "learning_rate": 0.00014725274725274723,
+      "loss": 0.6806,
+      "step": 1524
+    },
+    {
+      "epoch": 1.675133873403817,
+      "grad_norm": 0.39892563223838806,
+      "learning_rate": 0.00014713064713064712,
+      "loss": 0.598,
+      "step": 1525
+    },
+    {
+      "epoch": 1.6762323218453934,
+      "grad_norm": 0.4714735150337219,
+      "learning_rate": 0.000147008547008547,
+      "loss": 0.9637,
+      "step": 1526
+    },
+    {
+      "epoch": 1.6773307702869698,
+      "grad_norm": 0.8308823108673096,
+      "learning_rate": 0.00014688644688644686,
+      "loss": 0.7886,
+      "step": 1527
+    },
+    {
+      "epoch": 1.678429218728546,
+      "grad_norm": 0.5142358541488647,
+      "learning_rate": 0.00014676434676434675,
+      "loss": 0.8028,
+      "step": 1528
+    },
+    {
+      "epoch": 1.6795276671701223,
+      "grad_norm": 0.4001234471797943,
+      "learning_rate": 0.00014664224664224663,
+      "loss": 0.59,
+      "step": 1529
+    },
+    {
+      "epoch": 1.6806261156116986,
+      "grad_norm": 0.4112735688686371,
+      "learning_rate": 0.0001465201465201465,
+      "loss": 0.6523,
+      "step": 1530
+    },
+    {
+      "epoch": 1.6817245640532748,
+      "grad_norm": 0.4391016960144043,
+      "learning_rate": 0.0001463980463980464,
+      "loss": 0.7372,
+      "step": 1531
+    },
+    {
+      "epoch": 1.682823012494851,
+      "grad_norm": 0.7199782133102417,
+      "learning_rate": 0.00014627594627594626,
+      "loss": 0.8493,
+      "step": 1532
+    },
+    {
+      "epoch": 1.6839214609364273,
+      "grad_norm": 0.42379269003868103,
+      "learning_rate": 0.00014615384615384615,
+      "loss": 0.6609,
+      "step": 1533
+    },
+    {
+      "epoch": 1.6850199093780036,
+      "grad_norm": 0.41174909472465515,
+      "learning_rate": 0.000146031746031746,
+      "loss": 0.7021,
+      "step": 1534
+    },
+    {
+      "epoch": 1.6861183578195797,
+      "grad_norm": 0.4856640100479126,
+      "learning_rate": 0.0001459096459096459,
+      "loss": 0.6055,
+      "step": 1535
+    },
+    {
+      "epoch": 1.687216806261156,
+      "grad_norm": 0.5789656043052673,
+      "learning_rate": 0.00014578754578754578,
+      "loss": 0.7003,
+      "step": 1536
+    },
+    {
+      "epoch": 1.6883152547027325,
+      "grad_norm": 0.5711427330970764,
+      "learning_rate": 0.00014566544566544564,
+      "loss": 0.5762,
+      "step": 1537
+    },
+    {
+      "epoch": 1.6894137031443086,
+      "grad_norm": 0.3285518288612366,
+      "learning_rate": 0.00014554334554334552,
+      "loss": 0.6232,
+      "step": 1538
+    },
+    {
+      "epoch": 1.690512151585885,
+      "grad_norm": 0.48425230383872986,
+      "learning_rate": 0.0001454212454212454,
+      "loss": 0.5515,
+      "step": 1539
+    },
+    {
+      "epoch": 1.6916106000274613,
+      "grad_norm": 0.573079526424408,
+      "learning_rate": 0.0001452991452991453,
+      "loss": 0.7776,
+      "step": 1540
+    },
+    {
+      "epoch": 1.6927090484690375,
+      "grad_norm": 0.49084943532943726,
+      "learning_rate": 0.00014517704517704518,
+      "loss": 0.6504,
+      "step": 1541
+    },
+    {
+      "epoch": 1.6938074969106136,
+      "grad_norm": 0.46472617983818054,
+      "learning_rate": 0.00014505494505494504,
+      "loss": 0.6971,
+      "step": 1542
+    },
+    {
+      "epoch": 1.6949059453521902,
+      "grad_norm": 0.4890255033969879,
+      "learning_rate": 0.00014493284493284492,
+      "loss": 0.9292,
+      "step": 1543
+    },
+    {
+      "epoch": 1.6960043937937663,
+      "grad_norm": 0.42868301272392273,
+      "learning_rate": 0.0001448107448107448,
+      "loss": 0.6024,
+      "step": 1544
+    },
+    {
+      "epoch": 1.6971028422353425,
+      "grad_norm": 0.5118973255157471,
+      "learning_rate": 0.00014468864468864467,
+      "loss": 0.7598,
+      "step": 1545
+    },
+    {
+      "epoch": 1.6982012906769188,
+      "grad_norm": 0.40809181332588196,
+      "learning_rate": 0.00014456654456654455,
+      "loss": 0.5157,
+      "step": 1546
+    },
+    {
+      "epoch": 1.6992997391184952,
+      "grad_norm": 0.5236404538154602,
+      "learning_rate": 0.0001444444444444444,
+      "loss": 0.84,
+      "step": 1547
+    },
+    {
+      "epoch": 1.7003981875600713,
+      "grad_norm": 0.5712966322898865,
+      "learning_rate": 0.00014432234432234432,
+      "loss": 0.7208,
+      "step": 1548
+    },
+    {
+      "epoch": 1.7014966360016477,
+      "grad_norm": 0.2910475730895996,
+      "learning_rate": 0.00014420024420024418,
+      "loss": 0.4998,
+      "step": 1549
+    },
+    {
+      "epoch": 1.702595084443224,
+      "grad_norm": 0.5326736569404602,
+      "learning_rate": 0.00014407814407814407,
+      "loss": 0.5492,
+      "step": 1550
+    },
+    {
+      "epoch": 1.7036935328848002,
+      "grad_norm": 0.5454451441764832,
+      "learning_rate": 0.00014395604395604395,
+      "loss": 0.9016,
+      "step": 1551
+    },
+    {
+      "epoch": 1.7047919813263763,
+      "grad_norm": 0.45031625032424927,
+      "learning_rate": 0.0001438339438339438,
+      "loss": 0.671,
+      "step": 1552
+    },
+    {
+      "epoch": 1.705890429767953,
+      "grad_norm": 0.5496229529380798,
+      "learning_rate": 0.0001437118437118437,
+      "loss": 0.6333,
+      "step": 1553
+    },
+    {
+      "epoch": 1.706988878209529,
+      "grad_norm": 0.4200669825077057,
+      "learning_rate": 0.00014358974358974358,
+      "loss": 0.6158,
+      "step": 1554
+    },
+    {
+      "epoch": 1.7080873266511052,
+      "grad_norm": 0.7623536586761475,
+      "learning_rate": 0.00014346764346764347,
+      "loss": 0.686,
+      "step": 1555
+    },
+    {
+      "epoch": 1.7091857750926815,
+      "grad_norm": 0.3363445997238159,
+      "learning_rate": 0.00014334554334554333,
+      "loss": 0.305,
+      "step": 1556
+    },
+    {
+      "epoch": 1.710284223534258,
+      "grad_norm": 0.5042807459831238,
+      "learning_rate": 0.0001432234432234432,
+      "loss": 0.72,
+      "step": 1557
+    },
+    {
+      "epoch": 1.711382671975834,
+      "grad_norm": 0.5264353156089783,
+      "learning_rate": 0.0001431013431013431,
+      "loss": 0.6778,
+      "step": 1558
+    },
+    {
+      "epoch": 1.7124811204174104,
+      "grad_norm": 0.48960715532302856,
+      "learning_rate": 0.00014297924297924296,
+      "loss": 0.4935,
+      "step": 1559
+    },
+    {
+      "epoch": 1.7135795688589868,
+      "grad_norm": 0.4308861792087555,
+      "learning_rate": 0.00014285714285714284,
+      "loss": 0.6527,
+      "step": 1560
+    },
+    {
+      "epoch": 1.714678017300563,
+      "grad_norm": 0.42890703678131104,
+      "learning_rate": 0.00014273504273504273,
+      "loss": 0.4846,
+      "step": 1561
+    },
+    {
+      "epoch": 1.7157764657421393,
+      "grad_norm": 0.5222750902175903,
+      "learning_rate": 0.0001426129426129426,
+      "loss": 0.764,
+      "step": 1562
+    },
+    {
+      "epoch": 1.7168749141837156,
+      "grad_norm": 0.49664998054504395,
+      "learning_rate": 0.00014249084249084247,
+      "loss": 0.5728,
+      "step": 1563
+    },
+    {
+      "epoch": 1.7179733626252918,
+      "grad_norm": 0.3131520748138428,
+      "learning_rate": 0.00014236874236874236,
+      "loss": 0.5089,
+      "step": 1564
+    },
+    {
+      "epoch": 1.719071811066868,
+      "grad_norm": 0.5098987221717834,
+      "learning_rate": 0.00014224664224664224,
+      "loss": 0.781,
+      "step": 1565
+    },
+    {
+      "epoch": 1.7201702595084445,
+      "grad_norm": 0.4040893316268921,
+      "learning_rate": 0.0001421245421245421,
+      "loss": 0.7358,
+      "step": 1566
+    },
+    {
+      "epoch": 1.7212687079500206,
+      "grad_norm": 0.3601396679878235,
+      "learning_rate": 0.00014200244200244198,
+      "loss": 0.5531,
+      "step": 1567
+    },
+    {
+      "epoch": 1.7223671563915968,
+      "grad_norm": 0.6634377837181091,
+      "learning_rate": 0.00014188034188034187,
+      "loss": 0.6548,
+      "step": 1568
+    },
+    {
+      "epoch": 1.7234656048331731,
+      "grad_norm": 0.35935553908348083,
+      "learning_rate": 0.00014175824175824173,
+      "loss": 0.5653,
+      "step": 1569
+    },
+    {
+      "epoch": 1.7245640532747495,
+      "grad_norm": 0.4607802927494049,
+      "learning_rate": 0.00014163614163614164,
+      "loss": 0.9111,
+      "step": 1570
+    },
+    {
+      "epoch": 1.7256625017163256,
+      "grad_norm": 1.0116467475891113,
+      "learning_rate": 0.0001415140415140415,
+      "loss": 0.9226,
+      "step": 1571
+    },
+    {
+      "epoch": 1.726760950157902,
+      "grad_norm": 0.9484761953353882,
+      "learning_rate": 0.00014139194139194139,
+      "loss": 0.7536,
+      "step": 1572
+    },
+    {
+      "epoch": 1.7278593985994783,
+      "grad_norm": 0.3684981167316437,
+      "learning_rate": 0.00014126984126984124,
+      "loss": 0.5013,
+      "step": 1573
+    },
+    {
+      "epoch": 1.7289578470410545,
+      "grad_norm": 0.40037083625793457,
+      "learning_rate": 0.00014114774114774113,
+      "loss": 0.8069,
+      "step": 1574
+    },
+    {
+      "epoch": 1.7300562954826306,
+      "grad_norm": 0.42828282713890076,
+      "learning_rate": 0.00014102564102564101,
+      "loss": 0.5586,
+      "step": 1575
+    },
+    {
+      "epoch": 1.7311547439242072,
+      "grad_norm": 0.3461548686027527,
+      "learning_rate": 0.00014090354090354087,
+      "loss": 0.6045,
+      "step": 1576
+    },
+    {
+      "epoch": 1.7322531923657833,
+      "grad_norm": 0.622982919216156,
+      "learning_rate": 0.00014078144078144079,
+      "loss": 0.8943,
+      "step": 1577
+    },
+    {
+      "epoch": 1.7333516408073595,
+      "grad_norm": 0.3318479359149933,
+      "learning_rate": 0.00014065934065934064,
+      "loss": 0.4058,
+      "step": 1578
+    },
+    {
+      "epoch": 1.7344500892489358,
+      "grad_norm": 0.5178685188293457,
+      "learning_rate": 0.00014053724053724053,
+      "loss": 0.5839,
+      "step": 1579
+    },
+    {
+      "epoch": 1.7355485376905122,
+      "grad_norm": 0.44273868203163147,
+      "learning_rate": 0.00014041514041514042,
+      "loss": 0.5394,
+      "step": 1580
+    },
+    {
+      "epoch": 1.7366469861320883,
+      "grad_norm": 0.60169517993927,
+      "learning_rate": 0.00014029304029304027,
+      "loss": 0.6753,
+      "step": 1581
+    },
+    {
+      "epoch": 1.7377454345736647,
+      "grad_norm": 0.7691718339920044,
+      "learning_rate": 0.00014017094017094016,
+      "loss": 0.9618,
+      "step": 1582
+    },
+    {
+      "epoch": 1.738843883015241,
+      "grad_norm": 0.3900390565395355,
+      "learning_rate": 0.00014004884004884004,
+      "loss": 0.5809,
+      "step": 1583
+    },
+    {
+      "epoch": 1.7399423314568172,
+      "grad_norm": 0.6272429823875427,
+      "learning_rate": 0.00013992673992673993,
+      "loss": 0.8579,
+      "step": 1584
+    },
+    {
+      "epoch": 1.7410407798983936,
+      "grad_norm": 0.30017220973968506,
+      "learning_rate": 0.0001398046398046398,
+      "loss": 0.5335,
+      "step": 1585
+    },
+    {
+      "epoch": 1.74213922833997,
+      "grad_norm": 0.4937066435813904,
+      "learning_rate": 0.00013968253968253967,
+      "loss": 0.7941,
+      "step": 1586
+    },
+    {
+      "epoch": 1.743237676781546,
+      "grad_norm": 0.47317594289779663,
+      "learning_rate": 0.00013956043956043956,
+      "loss": 0.6013,
+      "step": 1587
+    },
+    {
+      "epoch": 1.7443361252231222,
+      "grad_norm": 1.9155733585357666,
+      "learning_rate": 0.00013943833943833942,
+      "loss": 0.6708,
+      "step": 1588
+    },
+    {
+      "epoch": 1.7454345736646986,
+      "grad_norm": 0.3844835162162781,
+      "learning_rate": 0.0001393162393162393,
+      "loss": 0.7176,
+      "step": 1589
+    },
+    {
+      "epoch": 1.746533022106275,
+      "grad_norm": 0.42810145020484924,
+      "learning_rate": 0.0001391941391941392,
+      "loss": 0.9255,
+      "step": 1590
+    },
+    {
+      "epoch": 1.747631470547851,
+      "grad_norm": 3.846015691757202,
+      "learning_rate": 0.00013907203907203905,
+      "loss": 0.6202,
+      "step": 1591
+    },
+    {
+      "epoch": 1.7487299189894274,
+      "grad_norm": 0.42783257365226746,
+      "learning_rate": 0.00013894993894993893,
+      "loss": 0.7451,
+      "step": 1592
+    },
+    {
+      "epoch": 1.7498283674310038,
+      "grad_norm": 0.5237023234367371,
+      "learning_rate": 0.00013882783882783882,
+      "loss": 0.7961,
+      "step": 1593
+    },
+    {
+      "epoch": 1.75092681587258,
+      "grad_norm": 2.5639729499816895,
+      "learning_rate": 0.0001387057387057387,
+      "loss": 0.7026,
+      "step": 1594
+    },
+    {
+      "epoch": 1.7520252643141563,
+      "grad_norm": 0.5686498284339905,
+      "learning_rate": 0.00013858363858363856,
+      "loss": 0.4916,
+      "step": 1595
+    },
+    {
+      "epoch": 1.7531237127557326,
+      "grad_norm": 0.561611533164978,
+      "learning_rate": 0.00013846153846153845,
+      "loss": 0.772,
+      "step": 1596
+    },
+    {
+      "epoch": 1.7542221611973088,
+      "grad_norm": 0.6220077872276306,
+      "learning_rate": 0.00013833943833943833,
+      "loss": 0.5694,
+      "step": 1597
+    },
+    {
+      "epoch": 1.755320609638885,
+      "grad_norm": 0.6902570724487305,
+      "learning_rate": 0.0001382173382173382,
+      "loss": 0.7963,
+      "step": 1598
+    },
+    {
+      "epoch": 1.7564190580804615,
+      "grad_norm": 2.0417702198028564,
+      "learning_rate": 0.00013809523809523808,
+      "loss": 0.6721,
+      "step": 1599
+    },
+    {
+      "epoch": 1.7575175065220376,
+      "grad_norm": 0.36764901876449585,
+      "learning_rate": 0.00013797313797313796,
+      "loss": 0.5714,
+      "step": 1600
+    },
+    {
+      "epoch": 1.7586159549636138,
+      "grad_norm": 0.6679022908210754,
+      "learning_rate": 0.00013785103785103785,
+      "loss": 0.7025,
+      "step": 1601
+    },
+    {
+      "epoch": 1.7597144034051901,
+      "grad_norm": 0.5749796628952026,
+      "learning_rate": 0.0001377289377289377,
+      "loss": 0.7381,
+      "step": 1602
+    },
+    {
+      "epoch": 1.7608128518467665,
+      "grad_norm": 0.9285687208175659,
+      "learning_rate": 0.0001376068376068376,
+      "loss": 0.6,
+      "step": 1603
+    },
+    {
+      "epoch": 1.7619113002883426,
+      "grad_norm": 0.8209772706031799,
+      "learning_rate": 0.00013748473748473748,
+      "loss": 0.5701,
+      "step": 1604
+    },
+    {
+      "epoch": 1.763009748729919,
+      "grad_norm": 0.7823337912559509,
+      "learning_rate": 0.00013736263736263734,
+      "loss": 0.6695,
+      "step": 1605
+    },
+    {
+      "epoch": 1.7641081971714954,
+      "grad_norm": 0.4885605275630951,
+      "learning_rate": 0.00013724053724053725,
+      "loss": 0.6487,
+      "step": 1606
+    },
+    {
+      "epoch": 1.7652066456130715,
+      "grad_norm": 0.36517488956451416,
+      "learning_rate": 0.0001371184371184371,
+      "loss": 0.5798,
+      "step": 1607
+    },
+    {
+      "epoch": 1.7663050940546479,
+      "grad_norm": 0.49961966276168823,
+      "learning_rate": 0.000136996336996337,
+      "loss": 0.4373,
+      "step": 1608
+    },
+    {
+      "epoch": 1.7674035424962242,
+      "grad_norm": 0.495263010263443,
+      "learning_rate": 0.00013687423687423688,
+      "loss": 0.5868,
+      "step": 1609
+    },
+    {
+      "epoch": 1.7685019909378004,
+      "grad_norm": 0.7384648323059082,
+      "learning_rate": 0.00013675213675213674,
+      "loss": 0.4957,
+      "step": 1610
+    },
+    {
+      "epoch": 1.7696004393793765,
+      "grad_norm": 0.465440034866333,
+      "learning_rate": 0.00013663003663003662,
+      "loss": 0.7424,
+      "step": 1611
+    },
+    {
+      "epoch": 1.7706988878209529,
+      "grad_norm": 0.68381667137146,
+      "learning_rate": 0.00013650793650793648,
+      "loss": 1.0421,
+      "step": 1612
+    },
+    {
+      "epoch": 1.7717973362625292,
+      "grad_norm": 4.455906867980957,
+      "learning_rate": 0.00013638583638583637,
+      "loss": 0.6626,
+      "step": 1613
+    },
+    {
+      "epoch": 1.7728957847041054,
+      "grad_norm": 0.6165801286697388,
+      "learning_rate": 0.00013626373626373625,
+      "loss": 0.6072,
+      "step": 1614
+    },
+    {
+      "epoch": 1.7739942331456817,
+      "grad_norm": 0.8296604156494141,
+      "learning_rate": 0.00013614163614163614,
+      "loss": 0.6507,
+      "step": 1615
+    },
+    {
+      "epoch": 1.775092681587258,
+      "grad_norm": 0.4678190350532532,
+      "learning_rate": 0.00013601953601953602,
+      "loss": 0.8466,
+      "step": 1616
+    },
+    {
+      "epoch": 1.7761911300288342,
+      "grad_norm": 1.2141482830047607,
+      "learning_rate": 0.00013589743589743588,
+      "loss": 0.513,
+      "step": 1617
+    },
+    {
+      "epoch": 1.7772895784704106,
+      "grad_norm": 0.4522024691104889,
+      "learning_rate": 0.00013577533577533577,
+      "loss": 0.7571,
+      "step": 1618
+    },
+    {
+      "epoch": 1.778388026911987,
+      "grad_norm": 2.0903220176696777,
+      "learning_rate": 0.00013565323565323565,
+      "loss": 0.7359,
+      "step": 1619
+    },
+    {
+      "epoch": 1.779486475353563,
+      "grad_norm": 0.5292307734489441,
+      "learning_rate": 0.0001355311355311355,
+      "loss": 0.6526,
+      "step": 1620
+    },
+    {
+      "epoch": 1.7805849237951392,
+      "grad_norm": 0.5047786235809326,
+      "learning_rate": 0.0001354090354090354,
+      "loss": 0.7056,
+      "step": 1621
+    },
+    {
+      "epoch": 1.7816833722367158,
+      "grad_norm": 0.4102507531642914,
+      "learning_rate": 0.00013528693528693528,
+      "loss": 0.8673,
+      "step": 1622
+    },
+    {
+      "epoch": 1.782781820678292,
+      "grad_norm": 0.471556693315506,
+      "learning_rate": 0.00013516483516483517,
+      "loss": 0.9424,
+      "step": 1623
+    },
+    {
+      "epoch": 1.783880269119868,
+      "grad_norm": 0.6595687866210938,
+      "learning_rate": 0.00013504273504273502,
+      "loss": 0.661,
+      "step": 1624
+    },
+    {
+      "epoch": 1.7849787175614444,
+      "grad_norm": 0.6221860647201538,
+      "learning_rate": 0.0001349206349206349,
+      "loss": 0.5457,
+      "step": 1625
+    },
+    {
+      "epoch": 1.7860771660030208,
+      "grad_norm": 0.9256211519241333,
+      "learning_rate": 0.0001347985347985348,
+      "loss": 0.9216,
+      "step": 1626
+    },
+    {
+      "epoch": 1.787175614444597,
+      "grad_norm": 0.31376492977142334,
+      "learning_rate": 0.00013467643467643465,
+      "loss": 0.7071,
+      "step": 1627
+    },
+    {
+      "epoch": 1.7882740628861733,
+      "grad_norm": 0.5313776135444641,
+      "learning_rate": 0.00013455433455433454,
+      "loss": 0.8111,
+      "step": 1628
+    },
+    {
+      "epoch": 1.7893725113277497,
+      "grad_norm": 0.8203330636024475,
+      "learning_rate": 0.00013443223443223442,
+      "loss": 0.5301,
+      "step": 1629
+    },
+    {
+      "epoch": 1.7904709597693258,
+      "grad_norm": 0.42774948477745056,
+      "learning_rate": 0.0001343101343101343,
+      "loss": 0.8359,
+      "step": 1630
+    },
+    {
+      "epoch": 1.791569408210902,
+      "grad_norm": 0.8165685534477234,
+      "learning_rate": 0.00013418803418803417,
+      "loss": 0.4894,
+      "step": 1631
+    },
+    {
+      "epoch": 1.7926678566524785,
+      "grad_norm": 0.5739139318466187,
+      "learning_rate": 0.00013406593406593405,
+      "loss": 0.7009,
+      "step": 1632
+    },
+    {
+      "epoch": 1.7937663050940547,
+      "grad_norm": 0.5102986097335815,
+      "learning_rate": 0.00013394383394383394,
+      "loss": 0.7174,
+      "step": 1633
+    },
+    {
+      "epoch": 1.7948647535356308,
+      "grad_norm": 1.1377652883529663,
+      "learning_rate": 0.0001338217338217338,
+      "loss": 0.79,
+      "step": 1634
+    },
+    {
+      "epoch": 1.7959632019772072,
+      "grad_norm": 0.44272491335868835,
+      "learning_rate": 0.00013369963369963368,
+      "loss": 0.6761,
+      "step": 1635
+    },
+    {
+      "epoch": 1.7970616504187835,
+      "grad_norm": 0.5084714889526367,
+      "learning_rate": 0.00013357753357753357,
+      "loss": 0.6848,
+      "step": 1636
+    },
+    {
+      "epoch": 1.7981600988603597,
+      "grad_norm": 0.752017080783844,
+      "learning_rate": 0.00013345543345543345,
+      "loss": 0.6107,
+      "step": 1637
+    },
+    {
+      "epoch": 1.799258547301936,
+      "grad_norm": 0.4430617690086365,
+      "learning_rate": 0.0001333333333333333,
+      "loss": 0.7639,
+      "step": 1638
+    },
+    {
+      "epoch": 1.8003569957435124,
+      "grad_norm": 0.8098049759864807,
+      "learning_rate": 0.0001332112332112332,
+      "loss": 0.8172,
+      "step": 1639
+    },
+    {
+      "epoch": 1.8014554441850885,
+      "grad_norm": 0.6817697286605835,
+      "learning_rate": 0.00013308913308913308,
+      "loss": 0.8274,
+      "step": 1640
+    },
+    {
+      "epoch": 1.8025538926266649,
+      "grad_norm": 0.5132669806480408,
+      "learning_rate": 0.00013296703296703294,
+      "loss": 0.6269,
+      "step": 1641
+    },
+    {
+      "epoch": 1.8036523410682412,
+      "grad_norm": 0.8487284183502197,
+      "learning_rate": 0.00013284493284493283,
+      "loss": 0.6734,
+      "step": 1642
+    },
+    {
+      "epoch": 1.8047507895098174,
+      "grad_norm": 0.7084116339683533,
+      "learning_rate": 0.0001327228327228327,
+      "loss": 0.703,
+      "step": 1643
+    },
+    {
+      "epoch": 1.8058492379513935,
+      "grad_norm": 0.39045432209968567,
+      "learning_rate": 0.00013260073260073257,
+      "loss": 0.5466,
+      "step": 1644
+    },
+    {
+      "epoch": 1.8069476863929699,
+      "grad_norm": 0.4408475160598755,
+      "learning_rate": 0.00013247863247863248,
+      "loss": 0.4998,
+      "step": 1645
+    },
+    {
+      "epoch": 1.8080461348345462,
+      "grad_norm": 0.41640380024909973,
+      "learning_rate": 0.00013235653235653234,
+      "loss": 0.49,
+      "step": 1646
+    },
+    {
+      "epoch": 1.8091445832761224,
+      "grad_norm": 0.6760729551315308,
+      "learning_rate": 0.00013223443223443223,
+      "loss": 0.4537,
+      "step": 1647
+    },
+    {
+      "epoch": 1.8102430317176987,
+      "grad_norm": 0.42953255772590637,
+      "learning_rate": 0.0001321123321123321,
+      "loss": 0.489,
+      "step": 1648
+    },
+    {
+      "epoch": 1.811341480159275,
+      "grad_norm": 0.3260825574398041,
+      "learning_rate": 0.00013199023199023197,
+      "loss": 0.6633,
+      "step": 1649
+    },
+    {
+      "epoch": 1.8124399286008512,
+      "grad_norm": 0.7073171138763428,
+      "learning_rate": 0.00013186813186813186,
+      "loss": 0.4953,
+      "step": 1650
+    },
+    {
+      "epoch": 1.8135383770424276,
+      "grad_norm": 0.36153069138526917,
+      "learning_rate": 0.00013174603174603172,
+      "loss": 0.7641,
+      "step": 1651
+    },
+    {
+      "epoch": 1.814636825484004,
+      "grad_norm": 0.4233636260032654,
+      "learning_rate": 0.00013162393162393163,
+      "loss": 0.7119,
+      "step": 1652
+    },
+    {
+      "epoch": 1.81573527392558,
+      "grad_norm": 0.5262153148651123,
+      "learning_rate": 0.0001315018315018315,
+      "loss": 0.4516,
+      "step": 1653
+    },
+    {
+      "epoch": 1.8168337223671562,
+      "grad_norm": 0.5263295769691467,
+      "learning_rate": 0.00013137973137973137,
+      "loss": 0.7786,
+      "step": 1654
+    },
+    {
+      "epoch": 1.8179321708087328,
+      "grad_norm": 0.3681116998195648,
+      "learning_rate": 0.00013125763125763126,
+      "loss": 0.5295,
+      "step": 1655
+    },
+    {
+      "epoch": 1.819030619250309,
+      "grad_norm": 0.5075433254241943,
+      "learning_rate": 0.00013113553113553112,
+      "loss": 0.6017,
+      "step": 1656
+    },
+    {
+      "epoch": 1.820129067691885,
+      "grad_norm": 0.2960616946220398,
+      "learning_rate": 0.000131013431013431,
+      "loss": 0.4951,
+      "step": 1657
+    },
+    {
+      "epoch": 1.8212275161334615,
+      "grad_norm": 0.4010205864906311,
+      "learning_rate": 0.0001308913308913309,
+      "loss": 0.8916,
+      "step": 1658
+    },
+    {
+      "epoch": 1.8223259645750378,
+      "grad_norm": 0.9112391471862793,
+      "learning_rate": 0.00013076923076923077,
+      "loss": 0.4978,
+      "step": 1659
+    },
+    {
+      "epoch": 1.823424413016614,
+      "grad_norm": 0.7214633226394653,
+      "learning_rate": 0.00013064713064713063,
+      "loss": 0.791,
+      "step": 1660
+    },
+    {
+      "epoch": 1.8245228614581903,
+      "grad_norm": 0.4174933433532715,
+      "learning_rate": 0.00013052503052503052,
+      "loss": 0.4099,
+      "step": 1661
+    },
+    {
+      "epoch": 1.8256213098997667,
+      "grad_norm": 0.4622137248516083,
+      "learning_rate": 0.0001304029304029304,
+      "loss": 1.1726,
+      "step": 1662
+    },
+    {
+      "epoch": 1.8267197583413428,
+      "grad_norm": 0.5991957783699036,
+      "learning_rate": 0.00013028083028083026,
+      "loss": 0.6713,
+      "step": 1663
+    },
+    {
+      "epoch": 1.8278182067829192,
+      "grad_norm": 0.43959730863571167,
+      "learning_rate": 0.00013015873015873015,
+      "loss": 0.5676,
+      "step": 1664
+    },
+    {
+      "epoch": 1.8289166552244955,
+      "grad_norm": 0.6271671056747437,
+      "learning_rate": 0.00013003663003663003,
+      "loss": 0.7399,
+      "step": 1665
+    },
+    {
+      "epoch": 1.8300151036660717,
+      "grad_norm": 0.6412084102630615,
+      "learning_rate": 0.0001299145299145299,
+      "loss": 0.7585,
+      "step": 1666
+    },
+    {
+      "epoch": 1.8311135521076478,
+      "grad_norm": 0.4066605269908905,
+      "learning_rate": 0.00012979242979242977,
+      "loss": 0.5756,
+      "step": 1667
+    },
+    {
+      "epoch": 1.8322120005492242,
+      "grad_norm": 0.3568172752857208,
+      "learning_rate": 0.00012967032967032966,
+      "loss": 0.968,
+      "step": 1668
+    },
+    {
+      "epoch": 1.8333104489908005,
+      "grad_norm": 0.5061100721359253,
+      "learning_rate": 0.00012954822954822955,
+      "loss": 0.5089,
+      "step": 1669
+    },
+    {
+      "epoch": 1.8344088974323767,
+      "grad_norm": 3.013622522354126,
+      "learning_rate": 0.0001294261294261294,
+      "loss": 0.5101,
+      "step": 1670
+    },
+    {
+      "epoch": 1.835507345873953,
+      "grad_norm": 0.40078219771385193,
+      "learning_rate": 0.0001293040293040293,
+      "loss": 0.5602,
+      "step": 1671
+    },
+    {
+      "epoch": 1.8366057943155294,
+      "grad_norm": 0.4108009338378906,
+      "learning_rate": 0.00012918192918192918,
+      "loss": 0.6338,
+      "step": 1672
+    },
+    {
+      "epoch": 1.8377042427571055,
+      "grad_norm": 0.5452212691307068,
+      "learning_rate": 0.00012905982905982903,
+      "loss": 0.5358,
+      "step": 1673
+    },
+    {
+      "epoch": 1.838802691198682,
+      "grad_norm": 0.4694603979587555,
+      "learning_rate": 0.00012893772893772895,
+      "loss": 0.7031,
+      "step": 1674
+    },
+    {
+      "epoch": 1.8399011396402583,
+      "grad_norm": 0.3787671625614166,
+      "learning_rate": 0.0001288156288156288,
+      "loss": 0.5667,
+      "step": 1675
+    },
+    {
+      "epoch": 1.8409995880818344,
+      "grad_norm": 0.4842737317085266,
+      "learning_rate": 0.0001286935286935287,
+      "loss": 0.5082,
+      "step": 1676
+    },
+    {
+      "epoch": 1.8420980365234105,
+      "grad_norm": 0.7690992951393127,
+      "learning_rate": 0.00012857142857142855,
+      "loss": 0.706,
+      "step": 1677
+    },
+    {
+      "epoch": 1.8431964849649871,
+      "grad_norm": 1.0891668796539307,
+      "learning_rate": 0.00012844932844932843,
+      "loss": 0.7162,
+      "step": 1678
+    },
+    {
+      "epoch": 1.8442949334065633,
+      "grad_norm": 0.4118032157421112,
+      "learning_rate": 0.00012832722832722832,
+      "loss": 0.7019,
+      "step": 1679
+    },
+    {
+      "epoch": 1.8453933818481394,
+      "grad_norm": 0.513157308101654,
+      "learning_rate": 0.00012820512820512818,
+      "loss": 0.4359,
+      "step": 1680
+    },
+    {
+      "epoch": 1.8464918302897158,
+      "grad_norm": 1.3229504823684692,
+      "learning_rate": 0.0001280830280830281,
+      "loss": 0.5555,
+      "step": 1681
+    },
+    {
+      "epoch": 1.8475902787312921,
+      "grad_norm": 0.6301699876785278,
+      "learning_rate": 0.00012796092796092795,
+      "loss": 0.5211,
+      "step": 1682
+    },
+    {
+      "epoch": 1.8486887271728683,
+      "grad_norm": 0.6125632524490356,
+      "learning_rate": 0.00012783882783882783,
+      "loss": 0.6287,
+      "step": 1683
+    },
+    {
+      "epoch": 1.8497871756144446,
+      "grad_norm": 1.806593418121338,
+      "learning_rate": 0.00012771672771672772,
+      "loss": 0.5794,
+      "step": 1684
+    },
+    {
+      "epoch": 1.850885624056021,
+      "grad_norm": 1.2972358465194702,
+      "learning_rate": 0.00012759462759462758,
+      "loss": 0.9205,
+      "step": 1685
+    },
+    {
+      "epoch": 1.8519840724975971,
+      "grad_norm": 1.0519033670425415,
+      "learning_rate": 0.00012747252747252746,
+      "loss": 0.7103,
+      "step": 1686
+    },
+    {
+      "epoch": 1.8530825209391735,
+      "grad_norm": 1.6489734649658203,
+      "learning_rate": 0.00012735042735042735,
+      "loss": 0.7585,
+      "step": 1687
+    },
+    {
+      "epoch": 1.8541809693807498,
+      "grad_norm": 0.7229527235031128,
+      "learning_rate": 0.0001272283272283272,
+      "loss": 0.8109,
+      "step": 1688
+    },
+    {
+      "epoch": 1.855279417822326,
+      "grad_norm": 0.35257261991500854,
+      "learning_rate": 0.0001271062271062271,
+      "loss": 0.8014,
+      "step": 1689
+    },
+    {
+      "epoch": 1.856377866263902,
+      "grad_norm": 0.4653327167034149,
+      "learning_rate": 0.00012698412698412698,
+      "loss": 0.6404,
+      "step": 1690
+    },
+    {
+      "epoch": 1.8574763147054785,
+      "grad_norm": 0.5230842232704163,
+      "learning_rate": 0.00012686202686202686,
+      "loss": 0.7413,
+      "step": 1691
+    },
+    {
+      "epoch": 1.8585747631470548,
+      "grad_norm": 0.42130210995674133,
+      "learning_rate": 0.00012673992673992672,
+      "loss": 0.7283,
+      "step": 1692
+    },
+    {
+      "epoch": 1.859673211588631,
+      "grad_norm": 1.4667960405349731,
+      "learning_rate": 0.0001266178266178266,
+      "loss": 0.5656,
+      "step": 1693
+    },
+    {
+      "epoch": 1.8607716600302073,
+      "grad_norm": 0.4077359139919281,
+      "learning_rate": 0.0001264957264957265,
+      "loss": 0.5891,
+      "step": 1694
+    },
+    {
+      "epoch": 1.8618701084717837,
+      "grad_norm": 0.503654956817627,
+      "learning_rate": 0.00012637362637362635,
+      "loss": 0.5912,
+      "step": 1695
+    },
+    {
+      "epoch": 1.8629685569133598,
+      "grad_norm": 1.6315315961837769,
+      "learning_rate": 0.00012625152625152624,
+      "loss": 0.5588,
+      "step": 1696
+    },
+    {
+      "epoch": 1.8640670053549362,
+      "grad_norm": 0.783920407295227,
+      "learning_rate": 0.00012612942612942612,
+      "loss": 0.6585,
+      "step": 1697
+    },
+    {
+      "epoch": 1.8651654537965126,
+      "grad_norm": 0.7186728715896606,
+      "learning_rate": 0.000126007326007326,
+      "loss": 0.9174,
+      "step": 1698
+    },
+    {
+      "epoch": 1.8662639022380887,
+      "grad_norm": 0.8784156441688538,
+      "learning_rate": 0.00012588522588522587,
+      "loss": 0.5835,
+      "step": 1699
+    },
+    {
+      "epoch": 1.8673623506796648,
+      "grad_norm": 0.7090787887573242,
+      "learning_rate": 0.00012576312576312575,
+      "loss": 0.7555,
+      "step": 1700
+    },
+    {
+      "epoch": 1.8684607991212414,
+      "grad_norm": 0.5508129596710205,
+      "learning_rate": 0.00012564102564102564,
+      "loss": 0.6168,
+      "step": 1701
+    },
+    {
+      "epoch": 1.8695592475628175,
+      "grad_norm": 0.40403681993484497,
+      "learning_rate": 0.0001255189255189255,
+      "loss": 0.4528,
+      "step": 1702
+    },
+    {
+      "epoch": 1.8706576960043937,
+      "grad_norm": 0.9553635716438293,
+      "learning_rate": 0.00012539682539682538,
+      "loss": 0.654,
+      "step": 1703
+    },
+    {
+      "epoch": 1.87175614444597,
+      "grad_norm": 1.0610092878341675,
+      "learning_rate": 0.00012527472527472527,
+      "loss": 0.6115,
+      "step": 1704
+    },
+    {
+      "epoch": 1.8728545928875464,
+      "grad_norm": 0.32898634672164917,
+      "learning_rate": 0.00012515262515262515,
+      "loss": 0.5651,
+      "step": 1705
+    },
+    {
+      "epoch": 1.8739530413291225,
+      "grad_norm": 0.4018780589103699,
+      "learning_rate": 0.000125030525030525,
+      "loss": 0.5919,
+      "step": 1706
+    },
+    {
+      "epoch": 1.875051489770699,
+      "grad_norm": 1.6521873474121094,
+      "learning_rate": 0.0001249084249084249,
+      "loss": 0.7137,
+      "step": 1707
+    },
+    {
+      "epoch": 1.8761499382122753,
+      "grad_norm": 0.5515930652618408,
+      "learning_rate": 0.00012478632478632478,
+      "loss": 0.4471,
+      "step": 1708
+    },
+    {
+      "epoch": 1.8772483866538514,
+      "grad_norm": 0.4156915545463562,
+      "learning_rate": 0.00012466422466422464,
+      "loss": 0.6575,
+      "step": 1709
+    },
+    {
+      "epoch": 1.8783468350954275,
+      "grad_norm": 0.41263312101364136,
+      "learning_rate": 0.00012454212454212453,
+      "loss": 0.542,
+      "step": 1710
+    },
+    {
+      "epoch": 1.8794452835370041,
+      "grad_norm": 1.0169517993927002,
+      "learning_rate": 0.0001244200244200244,
+      "loss": 1.1631,
+      "step": 1711
+    },
+    {
+      "epoch": 1.8805437319785803,
+      "grad_norm": 0.49169981479644775,
+      "learning_rate": 0.0001242979242979243,
+      "loss": 0.6707,
+      "step": 1712
+    },
+    {
+      "epoch": 1.8816421804201564,
+      "grad_norm": 0.44801297783851624,
+      "learning_rate": 0.00012417582417582416,
+      "loss": 1.0036,
+      "step": 1713
+    },
+    {
+      "epoch": 1.8827406288617328,
+      "grad_norm": 0.47181040048599243,
+      "learning_rate": 0.00012405372405372404,
+      "loss": 0.6693,
+      "step": 1714
+    },
+    {
+      "epoch": 1.8838390773033091,
+      "grad_norm": 0.39900457859039307,
+      "learning_rate": 0.00012393162393162393,
+      "loss": 0.6421,
+      "step": 1715
+    },
+    {
+      "epoch": 1.8849375257448853,
+      "grad_norm": 1.1160179376602173,
+      "learning_rate": 0.00012380952380952378,
+      "loss": 0.6599,
+      "step": 1716
+    },
+    {
+      "epoch": 1.8860359741864616,
+      "grad_norm": 0.6951555609703064,
+      "learning_rate": 0.00012368742368742367,
+      "loss": 0.743,
+      "step": 1717
+    },
+    {
+      "epoch": 1.887134422628038,
+      "grad_norm": 0.5381472706794739,
+      "learning_rate": 0.00012356532356532356,
+      "loss": 0.5051,
+      "step": 1718
+    },
+    {
+      "epoch": 1.8882328710696141,
+      "grad_norm": 0.48717793822288513,
+      "learning_rate": 0.00012344322344322341,
+      "loss": 0.7015,
+      "step": 1719
+    },
+    {
+      "epoch": 1.8893313195111905,
+      "grad_norm": 0.3720596432685852,
+      "learning_rate": 0.00012332112332112333,
+      "loss": 0.6743,
+      "step": 1720
+    },
+    {
+      "epoch": 1.8904297679527668,
+      "grad_norm": 1.1850451231002808,
+      "learning_rate": 0.00012319902319902318,
+      "loss": 0.6132,
+      "step": 1721
+    },
+    {
+      "epoch": 1.891528216394343,
+      "grad_norm": 0.4546525180339813,
+      "learning_rate": 0.00012307692307692307,
+      "loss": 0.5465,
+      "step": 1722
+    },
+    {
+      "epoch": 1.8926266648359191,
+      "grad_norm": 0.41415080428123474,
+      "learning_rate": 0.00012295482295482296,
+      "loss": 0.7259,
+      "step": 1723
+    },
+    {
+      "epoch": 1.8937251132774955,
+      "grad_norm": 0.44278842210769653,
+      "learning_rate": 0.00012283272283272281,
+      "loss": 0.7244,
+      "step": 1724
+    },
+    {
+      "epoch": 1.8948235617190718,
+      "grad_norm": 0.3887364864349365,
+      "learning_rate": 0.0001227106227106227,
+      "loss": 0.7124,
+      "step": 1725
+    },
+    {
+      "epoch": 1.895922010160648,
+      "grad_norm": 0.5405781269073486,
+      "learning_rate": 0.00012258852258852256,
+      "loss": 0.5153,
+      "step": 1726
+    },
+    {
+      "epoch": 1.8970204586022243,
+      "grad_norm": 0.3530559837818146,
+      "learning_rate": 0.00012246642246642247,
+      "loss": 0.5429,
+      "step": 1727
+    },
+    {
+      "epoch": 1.8981189070438007,
+      "grad_norm": 0.523621678352356,
+      "learning_rate": 0.00012234432234432233,
+      "loss": 0.5645,
+      "step": 1728
+    },
+    {
+      "epoch": 1.8992173554853768,
+      "grad_norm": 0.3893704116344452,
+      "learning_rate": 0.00012222222222222221,
+      "loss": 0.6419,
+      "step": 1729
+    },
+    {
+      "epoch": 1.9003158039269532,
+      "grad_norm": 0.7010704278945923,
+      "learning_rate": 0.0001221001221001221,
+      "loss": 0.5202,
+      "step": 1730
+    },
+    {
+      "epoch": 1.9014142523685296,
+      "grad_norm": 0.45551490783691406,
+      "learning_rate": 0.00012197802197802197,
+      "loss": 0.8492,
+      "step": 1731
+    },
+    {
+      "epoch": 1.9025127008101057,
+      "grad_norm": 1.0112484693527222,
+      "learning_rate": 0.00012185592185592184,
+      "loss": 0.8602,
+      "step": 1732
+    },
+    {
+      "epoch": 1.9036111492516818,
+      "grad_norm": 0.4509601294994354,
+      "learning_rate": 0.00012173382173382173,
+      "loss": 0.6138,
+      "step": 1733
+    },
+    {
+      "epoch": 1.9047095976932584,
+      "grad_norm": 0.4303388297557831,
+      "learning_rate": 0.0001216117216117216,
+      "loss": 0.4748,
+      "step": 1734
+    },
+    {
+      "epoch": 1.9058080461348346,
+      "grad_norm": 0.4452000558376312,
+      "learning_rate": 0.00012148962148962147,
+      "loss": 0.5869,
+      "step": 1735
+    },
+    {
+      "epoch": 1.9069064945764107,
+      "grad_norm": 0.5915077924728394,
+      "learning_rate": 0.00012136752136752136,
+      "loss": 0.8057,
+      "step": 1736
+    },
+    {
+      "epoch": 1.908004943017987,
+      "grad_norm": 0.38761547207832336,
+      "learning_rate": 0.00012124542124542123,
+      "loss": 0.5772,
+      "step": 1737
+    },
+    {
+      "epoch": 1.9091033914595634,
+      "grad_norm": 0.517752468585968,
+      "learning_rate": 0.00012112332112332112,
+      "loss": 0.7865,
+      "step": 1738
+    },
+    {
+      "epoch": 1.9102018399011396,
+      "grad_norm": 0.5325546860694885,
+      "learning_rate": 0.00012100122100122099,
+      "loss": 0.5934,
+      "step": 1739
+    },
+    {
+      "epoch": 1.911300288342716,
+      "grad_norm": 0.3930620551109314,
+      "learning_rate": 0.00012087912087912087,
+      "loss": 0.5974,
+      "step": 1740
+    },
+    {
+      "epoch": 1.9123987367842923,
+      "grad_norm": 1.1001818180084229,
+      "learning_rate": 0.00012075702075702075,
+      "loss": 0.6524,
+      "step": 1741
+    },
+    {
+      "epoch": 1.9134971852258684,
+      "grad_norm": 0.3690165877342224,
+      "learning_rate": 0.00012063492063492062,
+      "loss": 0.36,
+      "step": 1742
+    },
+    {
+      "epoch": 1.9145956336674448,
+      "grad_norm": 0.4403206408023834,
+      "learning_rate": 0.0001205128205128205,
+      "loss": 0.5737,
+      "step": 1743
+    },
+    {
+      "epoch": 1.9156940821090211,
+      "grad_norm": 0.651498019695282,
+      "learning_rate": 0.00012039072039072037,
+      "loss": 0.657,
+      "step": 1744
+    },
+    {
+      "epoch": 1.9167925305505973,
+      "grad_norm": 0.6880660057067871,
+      "learning_rate": 0.00012026862026862025,
+      "loss": 0.6891,
+      "step": 1745
+    },
+    {
+      "epoch": 1.9178909789921734,
+      "grad_norm": 0.4968664348125458,
+      "learning_rate": 0.00012014652014652015,
+      "loss": 0.841,
+      "step": 1746
+    },
+    {
+      "epoch": 1.9189894274337498,
+      "grad_norm": 0.4392407536506653,
+      "learning_rate": 0.00012002442002442002,
+      "loss": 0.7096,
+      "step": 1747
+    },
+    {
+      "epoch": 1.9200878758753261,
+      "grad_norm": 0.41028741002082825,
+      "learning_rate": 0.00011990231990231989,
+      "loss": 0.5838,
+      "step": 1748
+    },
+    {
+      "epoch": 1.9211863243169023,
+      "grad_norm": 0.7928158640861511,
+      "learning_rate": 0.00011978021978021978,
+      "loss": 0.6633,
+      "step": 1749
+    },
+    {
+      "epoch": 1.9222847727584786,
+      "grad_norm": 0.4970681071281433,
+      "learning_rate": 0.00011965811965811965,
+      "loss": 0.7764,
+      "step": 1750
+    },
+    {
+      "epoch": 1.923383221200055,
+      "grad_norm": 0.49581378698349,
+      "learning_rate": 0.00011953601953601952,
+      "loss": 0.7204,
+      "step": 1751
+    },
+    {
+      "epoch": 1.9244816696416311,
+      "grad_norm": 1.309241771697998,
+      "learning_rate": 0.00011941391941391939,
+      "loss": 0.5859,
+      "step": 1752
+    },
+    {
+      "epoch": 1.9255801180832075,
+      "grad_norm": 0.4651016592979431,
+      "learning_rate": 0.00011929181929181929,
+      "loss": 0.6425,
+      "step": 1753
+    },
+    {
+      "epoch": 1.9266785665247839,
+      "grad_norm": 0.5377634167671204,
+      "learning_rate": 0.00011916971916971916,
+      "loss": 0.8244,
+      "step": 1754
+    },
+    {
+      "epoch": 1.92777701496636,
+      "grad_norm": 0.6809287667274475,
+      "learning_rate": 0.00011904761904761903,
+      "loss": 0.5711,
+      "step": 1755
+    },
+    {
+      "epoch": 1.9288754634079361,
+      "grad_norm": 0.650701105594635,
+      "learning_rate": 0.00011892551892551892,
+      "loss": 0.8341,
+      "step": 1756
+    },
+    {
+      "epoch": 1.9299739118495127,
+      "grad_norm": 1.1710751056671143,
+      "learning_rate": 0.00011880341880341879,
+      "loss": 0.8093,
+      "step": 1757
+    },
+    {
+      "epoch": 1.9310723602910889,
+      "grad_norm": 0.4244484603404999,
+      "learning_rate": 0.00011868131868131866,
+      "loss": 0.5556,
+      "step": 1758
+    },
+    {
+      "epoch": 1.932170808732665,
+      "grad_norm": 0.43999040126800537,
+      "learning_rate": 0.00011855921855921855,
+      "loss": 0.4582,
+      "step": 1759
+    },
+    {
+      "epoch": 1.9332692571742414,
+      "grad_norm": 0.4197145700454712,
+      "learning_rate": 0.00011843711843711843,
+      "loss": 0.6475,
+      "step": 1760
+    },
+    {
+      "epoch": 1.9343677056158177,
+      "grad_norm": 0.36619749665260315,
+      "learning_rate": 0.0001183150183150183,
+      "loss": 0.5804,
+      "step": 1761
+    },
+    {
+      "epoch": 1.9354661540573939,
+      "grad_norm": 1.7230706214904785,
+      "learning_rate": 0.00011819291819291819,
+      "loss": 0.7064,
+      "step": 1762
+    },
+    {
+      "epoch": 1.9365646024989702,
+      "grad_norm": 0.7621874213218689,
+      "learning_rate": 0.00011807081807081806,
+      "loss": 0.6766,
+      "step": 1763
+    },
+    {
+      "epoch": 1.9376630509405466,
+      "grad_norm": 0.5920525789260864,
+      "learning_rate": 0.00011794871794871794,
+      "loss": 0.7092,
+      "step": 1764
+    },
+    {
+      "epoch": 1.9387614993821227,
+      "grad_norm": 1.5368432998657227,
+      "learning_rate": 0.00011782661782661781,
+      "loss": 0.3366,
+      "step": 1765
+    },
+    {
+      "epoch": 1.9398599478236989,
+      "grad_norm": 0.43197643756866455,
+      "learning_rate": 0.00011770451770451769,
+      "loss": 0.6158,
+      "step": 1766
+    },
+    {
+      "epoch": 1.9409583962652754,
+      "grad_norm": 0.4623143970966339,
+      "learning_rate": 0.00011758241758241756,
+      "loss": 0.6574,
+      "step": 1767
+    },
+    {
+      "epoch": 1.9420568447068516,
+      "grad_norm": 0.40638601779937744,
+      "learning_rate": 0.00011746031746031744,
+      "loss": 0.4385,
+      "step": 1768
+    },
+    {
+      "epoch": 1.9431552931484277,
+      "grad_norm": 0.5941652655601501,
+      "learning_rate": 0.00011733821733821734,
+      "loss": 0.8634,
+      "step": 1769
+    },
+    {
+      "epoch": 1.944253741590004,
+      "grad_norm": 0.9646288156509399,
+      "learning_rate": 0.00011721611721611721,
+      "loss": 0.7107,
+      "step": 1770
+    },
+    {
+      "epoch": 1.9453521900315804,
+      "grad_norm": 1.6859776973724365,
+      "learning_rate": 0.00011709401709401708,
+      "loss": 0.5544,
+      "step": 1771
+    },
+    {
+      "epoch": 1.9464506384731566,
+      "grad_norm": 0.4034999907016754,
+      "learning_rate": 0.00011697191697191697,
+      "loss": 0.559,
+      "step": 1772
+    },
+    {
+      "epoch": 1.947549086914733,
+      "grad_norm": 0.3644643723964691,
+      "learning_rate": 0.00011684981684981684,
+      "loss": 0.535,
+      "step": 1773
+    },
+    {
+      "epoch": 1.9486475353563093,
+      "grad_norm": 0.5826202034950256,
+      "learning_rate": 0.00011672771672771671,
+      "loss": 0.6405,
+      "step": 1774
+    },
+    {
+      "epoch": 1.9497459837978854,
+      "grad_norm": 0.5501505136489868,
+      "learning_rate": 0.00011660561660561661,
+      "loss": 0.5702,
+      "step": 1775
+    },
+    {
+      "epoch": 1.9508444322394618,
+      "grad_norm": 0.7928853631019592,
+      "learning_rate": 0.00011648351648351648,
+      "loss": 0.666,
+      "step": 1776
+    },
+    {
+      "epoch": 1.9519428806810382,
+      "grad_norm": 0.8168489933013916,
+      "learning_rate": 0.00011636141636141635,
+      "loss": 0.4451,
+      "step": 1777
+    },
+    {
+      "epoch": 1.9530413291226143,
+      "grad_norm": 0.3752410113811493,
+      "learning_rate": 0.00011623931623931622,
+      "loss": 0.6552,
+      "step": 1778
+    },
+    {
+      "epoch": 1.9541397775641904,
+      "grad_norm": 0.9020218849182129,
+      "learning_rate": 0.00011611721611721611,
+      "loss": 0.5994,
+      "step": 1779
+    },
+    {
+      "epoch": 1.9552382260057668,
+      "grad_norm": 0.7668479084968567,
+      "learning_rate": 0.00011599511599511598,
+      "loss": 0.5007,
+      "step": 1780
+    },
+    {
+      "epoch": 1.9563366744473432,
+      "grad_norm": 0.5034022331237793,
+      "learning_rate": 0.00011587301587301585,
+      "loss": 0.5211,
+      "step": 1781
+    },
+    {
+      "epoch": 1.9574351228889193,
+      "grad_norm": 1.0153850317001343,
+      "learning_rate": 0.00011575091575091575,
+      "loss": 0.5953,
+      "step": 1782
+    },
+    {
+      "epoch": 1.9585335713304957,
+      "grad_norm": 0.40088045597076416,
+      "learning_rate": 0.00011562881562881562,
+      "loss": 0.568,
+      "step": 1783
+    },
+    {
+      "epoch": 1.959632019772072,
+      "grad_norm": 1.4017099142074585,
+      "learning_rate": 0.0001155067155067155,
+      "loss": 0.7058,
+      "step": 1784
+    },
+    {
+      "epoch": 1.9607304682136482,
+      "grad_norm": 0.6009597778320312,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 0.6239,
+      "step": 1785
+    },
+    {
+      "epoch": 1.9618289166552245,
+      "grad_norm": 0.5155071020126343,
+      "learning_rate": 0.00011526251526251525,
+      "loss": 0.6089,
+      "step": 1786
+    },
+    {
+      "epoch": 1.9629273650968009,
+      "grad_norm": 0.4248057007789612,
+      "learning_rate": 0.00011514041514041513,
+      "loss": 0.6481,
+      "step": 1787
+    },
+    {
+      "epoch": 1.964025813538377,
+      "grad_norm": 0.6521177887916565,
+      "learning_rate": 0.00011501831501831501,
+      "loss": 0.6598,
+      "step": 1788
+    },
+    {
+      "epoch": 1.9651242619799532,
+      "grad_norm": 0.44697993993759155,
+      "learning_rate": 0.00011489621489621488,
+      "loss": 0.8944,
+      "step": 1789
+    },
+    {
+      "epoch": 1.9662227104215297,
+      "grad_norm": 0.41537097096443176,
+      "learning_rate": 0.00011477411477411476,
+      "loss": 0.5304,
+      "step": 1790
+    },
+    {
+      "epoch": 1.9673211588631059,
+      "grad_norm": 0.48793885111808777,
+      "learning_rate": 0.00011465201465201464,
+      "loss": 0.7262,
+      "step": 1791
+    },
+    {
+      "epoch": 1.968419607304682,
+      "grad_norm": 0.8768893480300903,
+      "learning_rate": 0.00011452991452991453,
+      "loss": 0.6748,
+      "step": 1792
+    },
+    {
+      "epoch": 1.9695180557462584,
+      "grad_norm": 0.39224761724472046,
+      "learning_rate": 0.0001144078144078144,
+      "loss": 0.5503,
+      "step": 1793
+    },
+    {
+      "epoch": 1.9706165041878347,
+      "grad_norm": 0.5617446899414062,
+      "learning_rate": 0.00011428571428571427,
+      "loss": 0.7329,
+      "step": 1794
+    },
+    {
+      "epoch": 1.9717149526294109,
+      "grad_norm": 0.3787171542644501,
+      "learning_rate": 0.00011416361416361416,
+      "loss": 0.545,
+      "step": 1795
+    },
+    {
+      "epoch": 1.9728134010709872,
+      "grad_norm": 1.5167701244354248,
+      "learning_rate": 0.00011404151404151403,
+      "loss": 0.492,
+      "step": 1796
+    },
+    {
+      "epoch": 1.9739118495125636,
+      "grad_norm": 0.6436883807182312,
+      "learning_rate": 0.0001139194139194139,
+      "loss": 0.5644,
+      "step": 1797
+    },
+    {
+      "epoch": 1.9750102979541397,
+      "grad_norm": 0.7104658484458923,
+      "learning_rate": 0.0001137973137973138,
+      "loss": 0.7485,
+      "step": 1798
+    },
+    {
+      "epoch": 1.976108746395716,
+      "grad_norm": 0.7996894717216492,
+      "learning_rate": 0.00011367521367521367,
+      "loss": 0.6918,
+      "step": 1799
+    },
+    {
+      "epoch": 1.9772071948372925,
+      "grad_norm": 0.6419106721878052,
+      "learning_rate": 0.00011355311355311354,
+      "loss": 0.5945,
+      "step": 1800
+    },
+    {
+      "epoch": 1.9783056432788686,
+      "grad_norm": 0.5158131718635559,
+      "learning_rate": 0.00011343101343101343,
+      "loss": 0.6685,
+      "step": 1801
+    },
+    {
+      "epoch": 1.9794040917204447,
+      "grad_norm": 1.0825144052505493,
+      "learning_rate": 0.0001133089133089133,
+      "loss": 0.6774,
+      "step": 1802
+    },
+    {
+      "epoch": 1.980502540162021,
+      "grad_norm": 0.3999088704586029,
+      "learning_rate": 0.00011318681318681317,
+      "loss": 0.632,
+      "step": 1803
+    },
+    {
+      "epoch": 1.9816009886035975,
+      "grad_norm": 0.8866069316864014,
+      "learning_rate": 0.00011306471306471304,
+      "loss": 0.6541,
+      "step": 1804
+    },
+    {
+      "epoch": 1.9826994370451736,
+      "grad_norm": 0.3858928978443146,
+      "learning_rate": 0.00011294261294261294,
+      "loss": 0.6608,
+      "step": 1805
+    },
+    {
+      "epoch": 1.98379788548675,
+      "grad_norm": 0.513117790222168,
+      "learning_rate": 0.00011282051282051281,
+      "loss": 0.7598,
+      "step": 1806
+    },
+    {
+      "epoch": 1.9848963339283263,
+      "grad_norm": 0.3166581392288208,
+      "learning_rate": 0.00011269841269841269,
+      "loss": 0.781,
+      "step": 1807
+    },
+    {
+      "epoch": 1.9859947823699025,
+      "grad_norm": 0.3982362151145935,
+      "learning_rate": 0.00011257631257631257,
+      "loss": 0.873,
+      "step": 1808
+    },
+    {
+      "epoch": 1.9870932308114788,
+      "grad_norm": 0.3784008026123047,
+      "learning_rate": 0.00011245421245421244,
+      "loss": 0.7286,
+      "step": 1809
+    },
+    {
+      "epoch": 1.9881916792530552,
+      "grad_norm": 0.7578315138816833,
+      "learning_rate": 0.00011233211233211232,
+      "loss": 0.5958,
+      "step": 1810
+    },
+    {
+      "epoch": 1.9892901276946313,
+      "grad_norm": 0.8509061932563782,
+      "learning_rate": 0.0001122100122100122,
+      "loss": 0.557,
+      "step": 1811
+    },
+    {
+      "epoch": 1.9903885761362075,
+      "grad_norm": 0.5107323527336121,
+      "learning_rate": 0.00011208791208791207,
+      "loss": 0.6994,
+      "step": 1812
+    },
+    {
+      "epoch": 1.991487024577784,
+      "grad_norm": 0.5421388149261475,
+      "learning_rate": 0.00011196581196581196,
+      "loss": 0.8839,
+      "step": 1813
+    },
+    {
+      "epoch": 1.9925854730193602,
+      "grad_norm": 0.7442356944084167,
+      "learning_rate": 0.00011184371184371184,
+      "loss": 0.6676,
+      "step": 1814
+    },
+    {
+      "epoch": 1.9936839214609363,
+      "grad_norm": 0.34132111072540283,
+      "learning_rate": 0.00011172161172161172,
+      "loss": 0.5714,
+      "step": 1815
+    },
+    {
+      "epoch": 1.9947823699025127,
+      "grad_norm": 0.3995620906352997,
+      "learning_rate": 0.00011159951159951159,
+      "loss": 0.4811,
+      "step": 1816
+    },
+    {
+      "epoch": 1.995880818344089,
+      "grad_norm": 0.5613861083984375,
+      "learning_rate": 0.00011147741147741146,
+      "loss": 0.7495,
+      "step": 1817
+    },
+    {
+      "epoch": 1.9969792667856652,
+      "grad_norm": 0.4366309642791748,
+      "learning_rate": 0.00011135531135531135,
+      "loss": 0.6512,
+      "step": 1818
+    },
+    {
+      "epoch": 1.9980777152272415,
+      "grad_norm": 0.889916718006134,
+      "learning_rate": 0.00011123321123321122,
+      "loss": 0.5544,
+      "step": 1819
+    },
+    {
+      "epoch": 1.999176163668818,
+      "grad_norm": 0.512112021446228,
+      "learning_rate": 0.00011111111111111109,
+      "loss": 1.136,
+      "step": 1820
+    },
+    {
+      "epoch": 2.000274612110394,
+      "grad_norm": 0.5241844654083252,
+      "learning_rate": 0.00011098901098901099,
+      "loss": 0.5898,
+      "step": 1821
+    },
+    {
+      "epoch": 2.00137306055197,
+      "grad_norm": 0.38159477710723877,
+      "learning_rate": 0.00011086691086691086,
+      "loss": 0.5523,
+      "step": 1822
+    },
+    {
+      "epoch": 2.0024715089935468,
+      "grad_norm": 1.0415009260177612,
+      "learning_rate": 0.00011074481074481073,
+      "loss": 0.6963,
+      "step": 1823
+    },
+    {
+      "epoch": 2.003569957435123,
+      "grad_norm": 0.5349957942962646,
+      "learning_rate": 0.00011062271062271062,
+      "loss": 0.4422,
+      "step": 1824
+    },
+    {
+      "epoch": 2.004668405876699,
+      "grad_norm": 0.4512043297290802,
+      "learning_rate": 0.00011050061050061049,
+      "loss": 0.5467,
+      "step": 1825
+    },
+    {
+      "epoch": 2.0057668543182756,
+      "grad_norm": 0.8268045783042908,
+      "learning_rate": 0.00011037851037851036,
+      "loss": 0.6931,
+      "step": 1826
+    },
+    {
+      "epoch": 2.0068653027598518,
+      "grad_norm": 0.47922319173812866,
+      "learning_rate": 0.00011025641025641026,
+      "loss": 0.707,
+      "step": 1827
+    },
+    {
+      "epoch": 2.007963751201428,
+      "grad_norm": 1.352858304977417,
+      "learning_rate": 0.00011013431013431013,
+      "loss": 0.5658,
+      "step": 1828
+    },
+    {
+      "epoch": 2.0090621996430045,
+      "grad_norm": 0.6304643154144287,
+      "learning_rate": 0.00011001221001221,
+      "loss": 0.6526,
+      "step": 1829
+    },
+    {
+      "epoch": 2.0101606480845806,
+      "grad_norm": 0.3759060502052307,
+      "learning_rate": 0.00010989010989010988,
+      "loss": 0.627,
+      "step": 1830
+    },
+    {
+      "epoch": 2.0112590965261568,
+      "grad_norm": 0.5676531195640564,
+      "learning_rate": 0.00010976800976800976,
+      "loss": 0.7568,
+      "step": 1831
+    },
+    {
+      "epoch": 2.012357544967733,
+      "grad_norm": 0.7481321692466736,
+      "learning_rate": 0.00010964590964590963,
+      "loss": 0.7304,
+      "step": 1832
+    },
+    {
+      "epoch": 2.0134559934093095,
+      "grad_norm": 1.0350905656814575,
+      "learning_rate": 0.0001095238095238095,
+      "loss": 0.7414,
+      "step": 1833
+    },
+    {
+      "epoch": 2.0145544418508856,
+      "grad_norm": 0.7817292809486389,
+      "learning_rate": 0.00010940170940170939,
+      "loss": 0.7742,
+      "step": 1834
+    },
+    {
+      "epoch": 2.0156528902924618,
+      "grad_norm": 0.44659602642059326,
+      "learning_rate": 0.00010927960927960928,
+      "loss": 0.7872,
+      "step": 1835
+    },
+    {
+      "epoch": 2.0167513387340383,
+      "grad_norm": 0.46931198239326477,
+      "learning_rate": 0.00010915750915750915,
+      "loss": 0.5596,
+      "step": 1836
+    },
+    {
+      "epoch": 2.0178497871756145,
+      "grad_norm": 0.34634560346603394,
+      "learning_rate": 0.00010903540903540903,
+      "loss": 0.6861,
+      "step": 1837
+    },
+    {
+      "epoch": 2.0189482356171906,
+      "grad_norm": 0.36579200625419617,
+      "learning_rate": 0.0001089133089133089,
+      "loss": 0.6586,
+      "step": 1838
+    },
+    {
+      "epoch": 2.020046684058767,
+      "grad_norm": 0.9167144894599915,
+      "learning_rate": 0.00010879120879120878,
+      "loss": 0.7125,
+      "step": 1839
+    },
+    {
+      "epoch": 2.0211451325003433,
+      "grad_norm": 0.4107789993286133,
+      "learning_rate": 0.00010866910866910866,
+      "loss": 0.6089,
+      "step": 1840
+    },
+    {
+      "epoch": 2.0222435809419195,
+      "grad_norm": 1.0845204591751099,
+      "learning_rate": 0.00010854700854700854,
+      "loss": 0.499,
+      "step": 1841
+    },
+    {
+      "epoch": 2.0233420293834956,
+      "grad_norm": 0.382376492023468,
+      "learning_rate": 0.00010842490842490841,
+      "loss": 0.5505,
+      "step": 1842
+    },
+    {
+      "epoch": 2.024440477825072,
+      "grad_norm": 0.38339781761169434,
+      "learning_rate": 0.00010830280830280828,
+      "loss": 0.4593,
+      "step": 1843
+    },
+    {
+      "epoch": 2.0255389262666483,
+      "grad_norm": 0.45328769087791443,
+      "learning_rate": 0.00010818070818070818,
+      "loss": 0.8437,
+      "step": 1844
+    },
+    {
+      "epoch": 2.0266373747082245,
+      "grad_norm": 0.3051920533180237,
+      "learning_rate": 0.00010805860805860805,
+      "loss": 0.6096,
+      "step": 1845
+    },
+    {
+      "epoch": 2.027735823149801,
+      "grad_norm": 0.4249560236930847,
+      "learning_rate": 0.00010793650793650792,
+      "loss": 0.6441,
+      "step": 1846
+    },
+    {
+      "epoch": 2.028834271591377,
+      "grad_norm": 0.6639708280563354,
+      "learning_rate": 0.00010781440781440781,
+      "loss": 0.716,
+      "step": 1847
+    },
+    {
+      "epoch": 2.0299327200329533,
+      "grad_norm": 0.4324635863304138,
+      "learning_rate": 0.00010769230769230768,
+      "loss": 0.5288,
+      "step": 1848
+    },
+    {
+      "epoch": 2.03103116847453,
+      "grad_norm": 0.46487629413604736,
+      "learning_rate": 0.00010757020757020755,
+      "loss": 0.4908,
+      "step": 1849
+    },
+    {
+      "epoch": 2.032129616916106,
+      "grad_norm": 0.5104641318321228,
+      "learning_rate": 0.00010744810744810745,
+      "loss": 0.6367,
+      "step": 1850
+    },
+    {
+      "epoch": 2.033228065357682,
+      "grad_norm": 0.4010922312736511,
+      "learning_rate": 0.00010732600732600732,
+      "loss": 0.4266,
+      "step": 1851
+    },
+    {
+      "epoch": 2.0343265137992583,
+      "grad_norm": 0.6835510730743408,
+      "learning_rate": 0.0001072039072039072,
+      "loss": 1.0077,
+      "step": 1852
+    },
+    {
+      "epoch": 2.035424962240835,
+      "grad_norm": 0.7012602686882019,
+      "learning_rate": 0.00010708180708180708,
+      "loss": 0.7656,
+      "step": 1853
+    },
+    {
+      "epoch": 2.036523410682411,
+      "grad_norm": 0.8202001452445984,
+      "learning_rate": 0.00010695970695970695,
+      "loss": 0.9796,
+      "step": 1854
+    },
+    {
+      "epoch": 2.037621859123987,
+      "grad_norm": 0.37708353996276855,
+      "learning_rate": 0.00010683760683760682,
+      "loss": 0.3664,
+      "step": 1855
+    },
+    {
+      "epoch": 2.0387203075655638,
+      "grad_norm": 0.34818801283836365,
+      "learning_rate": 0.0001067155067155067,
+      "loss": 0.5365,
+      "step": 1856
+    },
+    {
+      "epoch": 2.03981875600714,
+      "grad_norm": 0.46427440643310547,
+      "learning_rate": 0.0001065934065934066,
+      "loss": 0.7503,
+      "step": 1857
+    },
+    {
+      "epoch": 2.040917204448716,
+      "grad_norm": 0.4782754182815552,
+      "learning_rate": 0.00010647130647130647,
+      "loss": 0.9247,
+      "step": 1858
+    },
+    {
+      "epoch": 2.0420156528902926,
+      "grad_norm": 0.6814667582511902,
+      "learning_rate": 0.00010634920634920634,
+      "loss": 0.5365,
+      "step": 1859
+    },
+    {
+      "epoch": 2.0431141013318688,
+      "grad_norm": 0.4782056510448456,
+      "learning_rate": 0.00010622710622710622,
+      "loss": 0.7444,
+      "step": 1860
+    },
+    {
+      "epoch": 2.044212549773445,
+      "grad_norm": 0.768439769744873,
+      "learning_rate": 0.0001061050061050061,
+      "loss": 0.6386,
+      "step": 1861
+    },
+    {
+      "epoch": 2.0453109982150215,
+      "grad_norm": 0.9991740584373474,
+      "learning_rate": 0.00010598290598290597,
+      "loss": 0.4762,
+      "step": 1862
+    },
+    {
+      "epoch": 2.0464094466565976,
+      "grad_norm": 0.4244922995567322,
+      "learning_rate": 0.00010586080586080585,
+      "loss": 0.4469,
+      "step": 1863
+    },
+    {
+      "epoch": 2.0475078950981738,
+      "grad_norm": 0.4085465371608734,
+      "learning_rate": 0.00010573870573870573,
+      "loss": 0.7215,
+      "step": 1864
+    },
+    {
+      "epoch": 2.04860634353975,
+      "grad_norm": 1.3068008422851562,
+      "learning_rate": 0.0001056166056166056,
+      "loss": 0.7781,
+      "step": 1865
+    },
+    {
+      "epoch": 2.0497047919813265,
+      "grad_norm": 0.3995974659919739,
+      "learning_rate": 0.0001054945054945055,
+      "loss": 0.6114,
+      "step": 1866
+    },
+    {
+      "epoch": 2.0508032404229026,
+      "grad_norm": 0.47944560647010803,
+      "learning_rate": 0.00010537240537240537,
+      "loss": 0.7355,
+      "step": 1867
+    },
+    {
+      "epoch": 2.0519016888644788,
+      "grad_norm": 1.6718720197677612,
+      "learning_rate": 0.00010525030525030524,
+      "loss": 0.5987,
+      "step": 1868
+    },
+    {
+      "epoch": 2.0530001373060554,
+      "grad_norm": 0.46015220880508423,
+      "learning_rate": 0.00010512820512820511,
+      "loss": 0.481,
+      "step": 1869
+    },
+    {
+      "epoch": 2.0540985857476315,
+      "grad_norm": 0.4863795042037964,
+      "learning_rate": 0.000105006105006105,
+      "loss": 0.5877,
+      "step": 1870
+    },
+    {
+      "epoch": 2.0551970341892076,
+      "grad_norm": 0.9190402030944824,
+      "learning_rate": 0.00010488400488400487,
+      "loss": 0.7941,
+      "step": 1871
+    },
+    {
+      "epoch": 2.056295482630784,
+      "grad_norm": 0.6056554317474365,
+      "learning_rate": 0.00010476190476190474,
+      "loss": 0.5455,
+      "step": 1872
+    },
+    {
+      "epoch": 2.0573939310723603,
+      "grad_norm": 0.7070736289024353,
+      "learning_rate": 0.00010463980463980464,
+      "loss": 0.6112,
+      "step": 1873
+    },
+    {
+      "epoch": 2.0584923795139365,
+      "grad_norm": 0.5415268540382385,
+      "learning_rate": 0.00010451770451770451,
+      "loss": 0.7141,
+      "step": 1874
+    },
+    {
+      "epoch": 2.0595908279555126,
+      "grad_norm": 0.45696091651916504,
+      "learning_rate": 0.00010439560439560438,
+      "loss": 0.7825,
+      "step": 1875
+    },
+    {
+      "epoch": 2.060689276397089,
+      "grad_norm": 0.5728979706764221,
+      "learning_rate": 0.00010427350427350427,
+      "loss": 0.5869,
+      "step": 1876
+    },
+    {
+      "epoch": 2.0617877248386653,
+      "grad_norm": 0.5910143852233887,
+      "learning_rate": 0.00010415140415140414,
+      "loss": 0.728,
+      "step": 1877
+    },
+    {
+      "epoch": 2.0628861732802415,
+      "grad_norm": 0.530915379524231,
+      "learning_rate": 0.00010402930402930401,
+      "loss": 0.6459,
+      "step": 1878
+    },
+    {
+      "epoch": 2.063984621721818,
+      "grad_norm": 0.36358964443206787,
+      "learning_rate": 0.00010390720390720391,
+      "loss": 0.7536,
+      "step": 1879
+    },
+    {
+      "epoch": 2.065083070163394,
+      "grad_norm": 2.7523410320281982,
+      "learning_rate": 0.00010378510378510379,
+      "loss": 0.6347,
+      "step": 1880
+    },
+    {
+      "epoch": 2.0661815186049703,
+      "grad_norm": 0.6842527389526367,
+      "learning_rate": 0.00010366300366300366,
+      "loss": 0.4943,
+      "step": 1881
+    },
+    {
+      "epoch": 2.067279967046547,
+      "grad_norm": 0.5830293297767639,
+      "learning_rate": 0.00010354090354090353,
+      "loss": 0.5855,
+      "step": 1882
+    },
+    {
+      "epoch": 2.068378415488123,
+      "grad_norm": 0.981920599937439,
+      "learning_rate": 0.00010341880341880341,
+      "loss": 0.4425,
+      "step": 1883
+    },
+    {
+      "epoch": 2.069476863929699,
+      "grad_norm": 2.0826029777526855,
+      "learning_rate": 0.00010329670329670329,
+      "loss": 0.5399,
+      "step": 1884
+    },
+    {
+      "epoch": 2.0705753123712753,
+      "grad_norm": 0.4648442268371582,
+      "learning_rate": 0.00010317460317460316,
+      "loss": 0.6203,
+      "step": 1885
+    },
+    {
+      "epoch": 2.071673760812852,
+      "grad_norm": 0.5086346864700317,
+      "learning_rate": 0.00010305250305250304,
+      "loss": 0.6091,
+      "step": 1886
+    },
+    {
+      "epoch": 2.072772209254428,
+      "grad_norm": 0.40404266119003296,
+      "learning_rate": 0.00010293040293040292,
+      "loss": 0.5013,
+      "step": 1887
+    },
+    {
+      "epoch": 2.073870657696004,
+      "grad_norm": 2.0507569313049316,
+      "learning_rate": 0.0001028083028083028,
+      "loss": 0.7822,
+      "step": 1888
+    },
+    {
+      "epoch": 2.074969106137581,
+      "grad_norm": 0.9318211078643799,
+      "learning_rate": 0.00010268620268620269,
+      "loss": 0.6638,
+      "step": 1889
+    },
+    {
+      "epoch": 2.076067554579157,
+      "grad_norm": 0.7601054310798645,
+      "learning_rate": 0.00010256410256410256,
+      "loss": 0.6085,
+      "step": 1890
+    },
+    {
+      "epoch": 2.077166003020733,
+      "grad_norm": 1.1299306154251099,
+      "learning_rate": 0.00010244200244200243,
+      "loss": 0.682,
+      "step": 1891
+    },
+    {
+      "epoch": 2.0782644514623096,
+      "grad_norm": 0.5009475350379944,
+      "learning_rate": 0.0001023199023199023,
+      "loss": 0.7229,
+      "step": 1892
+    },
+    {
+      "epoch": 2.079362899903886,
+      "grad_norm": 0.3432561159133911,
+      "learning_rate": 0.00010219780219780219,
+      "loss": 0.5991,
+      "step": 1893
+    },
+    {
+      "epoch": 2.080461348345462,
+      "grad_norm": 0.5224031805992126,
+      "learning_rate": 0.00010207570207570206,
+      "loss": 0.3687,
+      "step": 1894
+    },
+    {
+      "epoch": 2.0815597967870385,
+      "grad_norm": 0.4849548935890198,
+      "learning_rate": 0.00010195360195360193,
+      "loss": 0.507,
+      "step": 1895
+    },
+    {
+      "epoch": 2.0826582452286146,
+      "grad_norm": 0.6093185544013977,
+      "learning_rate": 0.00010183150183150183,
+      "loss": 0.7019,
+      "step": 1896
+    },
+    {
+      "epoch": 2.083756693670191,
+      "grad_norm": 0.7408457398414612,
+      "learning_rate": 0.0001017094017094017,
+      "loss": 0.6331,
+      "step": 1897
+    },
+    {
+      "epoch": 2.084855142111767,
+      "grad_norm": 0.67701655626297,
+      "learning_rate": 0.00010158730158730157,
+      "loss": 0.6685,
+      "step": 1898
+    },
+    {
+      "epoch": 2.0859535905533435,
+      "grad_norm": 0.2880030870437622,
+      "learning_rate": 0.00010146520146520146,
+      "loss": 0.4043,
+      "step": 1899
+    },
+    {
+      "epoch": 2.0870520389949196,
+      "grad_norm": 0.45890796184539795,
+      "learning_rate": 0.00010134310134310133,
+      "loss": 0.3695,
+      "step": 1900
+    },
+    {
+      "epoch": 2.088150487436496,
+      "grad_norm": 0.7898344397544861,
+      "learning_rate": 0.0001012210012210012,
+      "loss": 0.7875,
+      "step": 1901
+    },
+    {
+      "epoch": 2.0892489358780724,
+      "grad_norm": 0.5648753046989441,
+      "learning_rate": 0.0001010989010989011,
+      "loss": 0.6058,
+      "step": 1902
+    },
+    {
+      "epoch": 2.0903473843196485,
+      "grad_norm": 0.7880465984344482,
+      "learning_rate": 0.00010097680097680098,
+      "loss": 0.6403,
+      "step": 1903
+    },
+    {
+      "epoch": 2.0914458327612246,
+      "grad_norm": 0.4169737696647644,
+      "learning_rate": 0.00010085470085470085,
+      "loss": 0.71,
+      "step": 1904
+    },
+    {
+      "epoch": 2.0925442812028012,
+      "grad_norm": 0.33653560280799866,
+      "learning_rate": 0.00010073260073260072,
+      "loss": 0.6278,
+      "step": 1905
+    },
+    {
+      "epoch": 2.0936427296443774,
+      "grad_norm": 0.6861558556556702,
+      "learning_rate": 0.0001006105006105006,
+      "loss": 0.8463,
+      "step": 1906
+    },
+    {
+      "epoch": 2.0947411780859535,
+      "grad_norm": 0.29407018423080444,
+      "learning_rate": 0.00010048840048840048,
+      "loss": 0.5644,
+      "step": 1907
+    },
+    {
+      "epoch": 2.09583962652753,
+      "grad_norm": 0.673083484172821,
+      "learning_rate": 0.00010036630036630035,
+      "loss": 0.8353,
+      "step": 1908
+    },
+    {
+      "epoch": 2.0969380749691062,
+      "grad_norm": 0.429061621427536,
+      "learning_rate": 0.00010024420024420023,
+      "loss": 0.6381,
+      "step": 1909
+    },
+    {
+      "epoch": 2.0980365234106824,
+      "grad_norm": 0.5113368630409241,
+      "learning_rate": 0.00010012210012210012,
+      "loss": 0.7603,
+      "step": 1910
+    },
+    {
+      "epoch": 2.0991349718522585,
+      "grad_norm": 0.9005820751190186,
+      "learning_rate": 9.999999999999999e-05,
+      "loss": 0.6331,
+      "step": 1911
+    },
+    {
+      "epoch": 2.100233420293835,
+      "grad_norm": 0.489851176738739,
+      "learning_rate": 9.987789987789988e-05,
+      "loss": 0.8564,
+      "step": 1912
+    },
+    {
+      "epoch": 2.1013318687354112,
+      "grad_norm": 0.42647236585617065,
+      "learning_rate": 9.975579975579975e-05,
+      "loss": 0.5496,
+      "step": 1913
+    },
+    {
+      "epoch": 2.1024303171769874,
+      "grad_norm": 0.9061693549156189,
+      "learning_rate": 9.963369963369962e-05,
+      "loss": 0.4478,
+      "step": 1914
+    },
+    {
+      "epoch": 2.103528765618564,
+      "grad_norm": 0.4721933901309967,
+      "learning_rate": 9.95115995115995e-05,
+      "loss": 0.6066,
+      "step": 1915
+    },
+    {
+      "epoch": 2.10462721406014,
+      "grad_norm": 0.7265921831130981,
+      "learning_rate": 9.938949938949938e-05,
+      "loss": 0.7195,
+      "step": 1916
+    },
+    {
+      "epoch": 2.1057256625017162,
+      "grad_norm": 0.4521386921405792,
+      "learning_rate": 9.926739926739925e-05,
+      "loss": 0.6476,
+      "step": 1917
+    },
+    {
+      "epoch": 2.106824110943293,
+      "grad_norm": 0.42982912063598633,
+      "learning_rate": 9.914529914529912e-05,
+      "loss": 0.535,
+      "step": 1918
+    },
+    {
+      "epoch": 2.107922559384869,
+      "grad_norm": 0.4758259952068329,
+      "learning_rate": 9.902319902319902e-05,
+      "loss": 0.8106,
+      "step": 1919
+    },
+    {
+      "epoch": 2.109021007826445,
+      "grad_norm": 0.69195157289505,
+      "learning_rate": 9.890109890109889e-05,
+      "loss": 0.6643,
+      "step": 1920
+    },
+    {
+      "epoch": 2.110119456268021,
+      "grad_norm": 0.8207395672798157,
+      "learning_rate": 9.877899877899876e-05,
+      "loss": 0.7535,
+      "step": 1921
+    },
+    {
+      "epoch": 2.111217904709598,
+      "grad_norm": 1.4245035648345947,
+      "learning_rate": 9.865689865689865e-05,
+      "loss": 0.6721,
+      "step": 1922
+    },
+    {
+      "epoch": 2.112316353151174,
+      "grad_norm": 0.5496362447738647,
+      "learning_rate": 9.853479853479852e-05,
+      "loss": 0.5367,
+      "step": 1923
+    },
+    {
+      "epoch": 2.11341480159275,
+      "grad_norm": 0.5466665625572205,
+      "learning_rate": 9.84126984126984e-05,
+      "loss": 0.6083,
+      "step": 1924
+    },
+    {
+      "epoch": 2.1145132500343267,
+      "grad_norm": 0.7750464677810669,
+      "learning_rate": 9.829059829059829e-05,
+      "loss": 0.663,
+      "step": 1925
+    },
+    {
+      "epoch": 2.115611698475903,
+      "grad_norm": 0.4978208541870117,
+      "learning_rate": 9.816849816849817e-05,
+      "loss": 0.6334,
+      "step": 1926
+    },
+    {
+      "epoch": 2.116710146917479,
+      "grad_norm": 0.6415550708770752,
+      "learning_rate": 9.804639804639804e-05,
+      "loss": 0.6477,
+      "step": 1927
+    },
+    {
+      "epoch": 2.1178085953590555,
+      "grad_norm": 0.644123911857605,
+      "learning_rate": 9.792429792429792e-05,
+      "loss": 0.668,
+      "step": 1928
+    },
+    {
+      "epoch": 2.1189070438006317,
+      "grad_norm": 0.39706236124038696,
+      "learning_rate": 9.78021978021978e-05,
+      "loss": 0.5875,
+      "step": 1929
+    },
+    {
+      "epoch": 2.120005492242208,
+      "grad_norm": 1.3733233213424683,
+      "learning_rate": 9.768009768009767e-05,
+      "loss": 0.6023,
+      "step": 1930
+    },
+    {
+      "epoch": 2.121103940683784,
+      "grad_norm": 0.48839983344078064,
+      "learning_rate": 9.755799755799754e-05,
+      "loss": 0.5693,
+      "step": 1931
+    },
+    {
+      "epoch": 2.1222023891253605,
+      "grad_norm": 0.3107692301273346,
+      "learning_rate": 9.743589743589744e-05,
+      "loss": 0.5822,
+      "step": 1932
+    },
+    {
+      "epoch": 2.1233008375669367,
+      "grad_norm": 0.3988654911518097,
+      "learning_rate": 9.731379731379731e-05,
+      "loss": 0.5989,
+      "step": 1933
+    },
+    {
+      "epoch": 2.124399286008513,
+      "grad_norm": 1.1887754201889038,
+      "learning_rate": 9.719169719169718e-05,
+      "loss": 0.6382,
+      "step": 1934
+    },
+    {
+      "epoch": 2.1254977344500894,
+      "grad_norm": 0.43282651901245117,
+      "learning_rate": 9.706959706959707e-05,
+      "loss": 0.5649,
+      "step": 1935
+    },
+    {
+      "epoch": 2.1265961828916655,
+      "grad_norm": 0.39243975281715393,
+      "learning_rate": 9.694749694749694e-05,
+      "loss": 0.7005,
+      "step": 1936
+    },
+    {
+      "epoch": 2.1276946313332417,
+      "grad_norm": 0.7401454448699951,
+      "learning_rate": 9.682539682539681e-05,
+      "loss": 1.0632,
+      "step": 1937
+    },
+    {
+      "epoch": 2.1287930797748182,
+      "grad_norm": 0.6976983547210693,
+      "learning_rate": 9.67032967032967e-05,
+      "loss": 0.562,
+      "step": 1938
+    },
+    {
+      "epoch": 2.1298915282163944,
+      "grad_norm": 0.9784336686134338,
+      "learning_rate": 9.658119658119657e-05,
+      "loss": 0.8115,
+      "step": 1939
+    },
+    {
+      "epoch": 2.1309899766579705,
+      "grad_norm": 0.5289125442504883,
+      "learning_rate": 9.645909645909644e-05,
+      "loss": 0.6161,
+      "step": 1940
+    },
+    {
+      "epoch": 2.132088425099547,
+      "grad_norm": 1.414559006690979,
+      "learning_rate": 9.633699633699634e-05,
+      "loss": 0.7115,
+      "step": 1941
+    },
+    {
+      "epoch": 2.1331868735411232,
+      "grad_norm": 0.5444177389144897,
+      "learning_rate": 9.621489621489621e-05,
+      "loss": 0.6211,
+      "step": 1942
+    },
+    {
+      "epoch": 2.1342853219826994,
+      "grad_norm": 0.637030839920044,
+      "learning_rate": 9.609279609279608e-05,
+      "loss": 0.8747,
+      "step": 1943
+    },
+    {
+      "epoch": 2.1353837704242755,
+      "grad_norm": 0.5926198363304138,
+      "learning_rate": 9.597069597069595e-05,
+      "loss": 0.8673,
+      "step": 1944
+    },
+    {
+      "epoch": 2.136482218865852,
+      "grad_norm": 0.3638801872730255,
+      "learning_rate": 9.584859584859584e-05,
+      "loss": 0.4698,
+      "step": 1945
+    },
+    {
+      "epoch": 2.1375806673074282,
+      "grad_norm": 0.5823031067848206,
+      "learning_rate": 9.572649572649571e-05,
+      "loss": 0.6988,
+      "step": 1946
+    },
+    {
+      "epoch": 2.1386791157490044,
+      "grad_norm": 0.44348934292793274,
+      "learning_rate": 9.560439560439558e-05,
+      "loss": 0.6667,
+      "step": 1947
+    },
+    {
+      "epoch": 2.139777564190581,
+      "grad_norm": 3.177112579345703,
+      "learning_rate": 9.548229548229548e-05,
+      "loss": 0.8738,
+      "step": 1948
+    },
+    {
+      "epoch": 2.140876012632157,
+      "grad_norm": 1.3834997415542603,
+      "learning_rate": 9.536019536019536e-05,
+      "loss": 0.528,
+      "step": 1949
+    },
+    {
+      "epoch": 2.1419744610737332,
+      "grad_norm": 0.5514722466468811,
+      "learning_rate": 9.523809523809523e-05,
+      "loss": 0.5058,
+      "step": 1950
+    },
+    {
+      "epoch": 2.14307290951531,
+      "grad_norm": 0.8795000314712524,
+      "learning_rate": 9.511599511599511e-05,
+      "loss": 0.6368,
+      "step": 1951
+    },
+    {
+      "epoch": 2.144171357956886,
+      "grad_norm": 1.0043178796768188,
+      "learning_rate": 9.499389499389498e-05,
+      "loss": 0.5701,
+      "step": 1952
+    },
+    {
+      "epoch": 2.145269806398462,
+      "grad_norm": 1.8537780046463013,
+      "learning_rate": 9.487179487179486e-05,
+      "loss": 0.6978,
+      "step": 1953
+    },
+    {
+      "epoch": 2.1463682548400387,
+      "grad_norm": 0.5239475965499878,
+      "learning_rate": 9.474969474969476e-05,
+      "loss": 0.7093,
+      "step": 1954
+    },
+    {
+      "epoch": 2.147466703281615,
+      "grad_norm": 0.7944377064704895,
+      "learning_rate": 9.462759462759463e-05,
+      "loss": 0.7625,
+      "step": 1955
+    },
+    {
+      "epoch": 2.148565151723191,
+      "grad_norm": 0.7356003522872925,
+      "learning_rate": 9.45054945054945e-05,
+      "loss": 0.6845,
+      "step": 1956
+    },
+    {
+      "epoch": 2.149663600164767,
+      "grad_norm": 1.3590694665908813,
+      "learning_rate": 9.438339438339437e-05,
+      "loss": 0.6964,
+      "step": 1957
+    },
+    {
+      "epoch": 2.1507620486063437,
+      "grad_norm": 0.40889453887939453,
+      "learning_rate": 9.426129426129426e-05,
+      "loss": 0.6643,
+      "step": 1958
+    },
+    {
+      "epoch": 2.15186049704792,
+      "grad_norm": 0.6347643136978149,
+      "learning_rate": 9.413919413919413e-05,
+      "loss": 1.0002,
+      "step": 1959
+    },
+    {
+      "epoch": 2.152958945489496,
+      "grad_norm": 0.3661377429962158,
+      "learning_rate": 9.4017094017094e-05,
+      "loss": 0.5084,
+      "step": 1960
+    },
+    {
+      "epoch": 2.1540573939310725,
+      "grad_norm": 0.8262574672698975,
+      "learning_rate": 9.389499389499389e-05,
+      "loss": 0.5658,
+      "step": 1961
+    },
+    {
+      "epoch": 2.1551558423726487,
+      "grad_norm": 0.6054818034172058,
+      "learning_rate": 9.377289377289376e-05,
+      "loss": 0.6349,
+      "step": 1962
+    },
+    {
+      "epoch": 2.156254290814225,
+      "grad_norm": 0.3696078658103943,
+      "learning_rate": 9.365079365079364e-05,
+      "loss": 0.5746,
+      "step": 1963
+    },
+    {
+      "epoch": 2.157352739255801,
+      "grad_norm": 0.7613049745559692,
+      "learning_rate": 9.352869352869353e-05,
+      "loss": 0.5204,
+      "step": 1964
+    },
+    {
+      "epoch": 2.1584511876973775,
+      "grad_norm": 0.6841816306114197,
+      "learning_rate": 9.34065934065934e-05,
+      "loss": 0.813,
+      "step": 1965
+    },
+    {
+      "epoch": 2.1595496361389537,
+      "grad_norm": 0.902998685836792,
+      "learning_rate": 9.328449328449327e-05,
+      "loss": 0.6288,
+      "step": 1966
+    },
+    {
+      "epoch": 2.16064808458053,
+      "grad_norm": 0.5367470979690552,
+      "learning_rate": 9.316239316239316e-05,
+      "loss": 0.6689,
+      "step": 1967
+    },
+    {
+      "epoch": 2.1617465330221064,
+      "grad_norm": 0.9443572163581848,
+      "learning_rate": 9.304029304029303e-05,
+      "loss": 0.6864,
+      "step": 1968
+    },
+    {
+      "epoch": 2.1628449814636825,
+      "grad_norm": 0.42191457748413086,
+      "learning_rate": 9.29181929181929e-05,
+      "loss": 0.6509,
+      "step": 1969
+    },
+    {
+      "epoch": 2.1639434299052587,
+      "grad_norm": 0.6019404530525208,
+      "learning_rate": 9.279609279609277e-05,
+      "loss": 0.5252,
+      "step": 1970
+    },
+    {
+      "epoch": 2.1650418783468353,
+      "grad_norm": 1.9933907985687256,
+      "learning_rate": 9.267399267399267e-05,
+      "loss": 0.6042,
+      "step": 1971
+    },
+    {
+      "epoch": 2.1661403267884114,
+      "grad_norm": 0.33075836300849915,
+      "learning_rate": 9.255189255189255e-05,
+      "loss": 0.579,
+      "step": 1972
+    },
+    {
+      "epoch": 2.1672387752299875,
+      "grad_norm": 0.37899547815322876,
+      "learning_rate": 9.242979242979242e-05,
+      "loss": 0.5006,
+      "step": 1973
+    },
+    {
+      "epoch": 2.168337223671564,
+      "grad_norm": 0.6482734680175781,
+      "learning_rate": 9.23076923076923e-05,
+      "loss": 0.4844,
+      "step": 1974
+    },
+    {
+      "epoch": 2.1694356721131403,
+      "grad_norm": 0.47632062435150146,
+      "learning_rate": 9.218559218559217e-05,
+      "loss": 0.5844,
+      "step": 1975
+    },
+    {
+      "epoch": 2.1705341205547164,
+      "grad_norm": 0.3402813971042633,
+      "learning_rate": 9.206349206349205e-05,
+      "loss": 0.6397,
+      "step": 1976
+    },
+    {
+      "epoch": 2.1716325689962925,
+      "grad_norm": 0.47405871748924255,
+      "learning_rate": 9.194139194139195e-05,
+      "loss": 0.6436,
+      "step": 1977
+    },
+    {
+      "epoch": 2.172731017437869,
+      "grad_norm": 0.5474234223365784,
+      "learning_rate": 9.181929181929182e-05,
+      "loss": 0.5758,
+      "step": 1978
+    },
+    {
+      "epoch": 2.1738294658794453,
+      "grad_norm": 0.5423378348350525,
+      "learning_rate": 9.169719169719169e-05,
+      "loss": 0.5882,
+      "step": 1979
+    },
+    {
+      "epoch": 2.1749279143210214,
+      "grad_norm": 0.32848963141441345,
+      "learning_rate": 9.157509157509158e-05,
+      "loss": 0.5828,
+      "step": 1980
+    },
+    {
+      "epoch": 2.176026362762598,
+      "grad_norm": 0.6646802425384521,
+      "learning_rate": 9.145299145299145e-05,
+      "loss": 0.551,
+      "step": 1981
+    },
+    {
+      "epoch": 2.177124811204174,
+      "grad_norm": 0.4560980200767517,
+      "learning_rate": 9.133089133089132e-05,
+      "loss": 0.705,
+      "step": 1982
+    },
+    {
+      "epoch": 2.1782232596457503,
+      "grad_norm": 0.4531053304672241,
+      "learning_rate": 9.120879120879119e-05,
+      "loss": 0.7471,
+      "step": 1983
+    },
+    {
+      "epoch": 2.179321708087327,
+      "grad_norm": 0.5881507992744446,
+      "learning_rate": 9.108669108669108e-05,
+      "loss": 0.7559,
+      "step": 1984
+    },
+    {
+      "epoch": 2.180420156528903,
+      "grad_norm": 0.41462886333465576,
+      "learning_rate": 9.096459096459096e-05,
+      "loss": 0.5674,
+      "step": 1985
+    },
+    {
+      "epoch": 2.181518604970479,
+      "grad_norm": 0.46718108654022217,
+      "learning_rate": 9.084249084249083e-05,
+      "loss": 0.7149,
+      "step": 1986
+    },
+    {
+      "epoch": 2.1826170534120557,
+      "grad_norm": 0.49290111660957336,
+      "learning_rate": 9.072039072039072e-05,
+      "loss": 0.5641,
+      "step": 1987
+    },
+    {
+      "epoch": 2.183715501853632,
+      "grad_norm": 0.398296594619751,
+      "learning_rate": 9.059829059829059e-05,
+      "loss": 0.5177,
+      "step": 1988
+    },
+    {
+      "epoch": 2.184813950295208,
+      "grad_norm": 0.8241115212440491,
+      "learning_rate": 9.047619047619046e-05,
+      "loss": 0.7864,
+      "step": 1989
+    },
+    {
+      "epoch": 2.185912398736784,
+      "grad_norm": 1.1335865259170532,
+      "learning_rate": 9.035409035409035e-05,
+      "loss": 0.6167,
+      "step": 1990
+    },
+    {
+      "epoch": 2.1870108471783607,
+      "grad_norm": 0.4479789435863495,
+      "learning_rate": 9.023199023199022e-05,
+      "loss": 0.6365,
+      "step": 1991
+    },
+    {
+      "epoch": 2.188109295619937,
+      "grad_norm": 0.4892582297325134,
+      "learning_rate": 9.010989010989009e-05,
+      "loss": 0.6283,
+      "step": 1992
+    },
+    {
+      "epoch": 2.189207744061513,
+      "grad_norm": 0.8397974371910095,
+      "learning_rate": 8.998778998778999e-05,
+      "loss": 0.7123,
+      "step": 1993
+    },
+    {
+      "epoch": 2.1903061925030896,
+      "grad_norm": 0.5295377969741821,
+      "learning_rate": 8.986568986568986e-05,
+      "loss": 0.4033,
+      "step": 1994
+    },
+    {
+      "epoch": 2.1914046409446657,
+      "grad_norm": 0.464832067489624,
+      "learning_rate": 8.974358974358974e-05,
+      "loss": 0.8228,
+      "step": 1995
+    },
+    {
+      "epoch": 2.192503089386242,
+      "grad_norm": 0.381369024515152,
+      "learning_rate": 8.962148962148961e-05,
+      "loss": 0.6267,
+      "step": 1996
+    },
+    {
+      "epoch": 2.193601537827818,
+      "grad_norm": 0.7176710963249207,
+      "learning_rate": 8.949938949938949e-05,
+      "loss": 0.7008,
+      "step": 1997
+    },
+    {
+      "epoch": 2.1946999862693946,
+      "grad_norm": 2.569753885269165,
+      "learning_rate": 8.937728937728936e-05,
+      "loss": 0.6899,
+      "step": 1998
+    },
+    {
+      "epoch": 2.1957984347109707,
+      "grad_norm": 0.5020056962966919,
+      "learning_rate": 8.925518925518924e-05,
+      "loss": 0.527,
+      "step": 1999
+    },
+    {
+      "epoch": 2.196896883152547,
+      "grad_norm": 1.7054524421691895,
+      "learning_rate": 8.913308913308914e-05,
+      "loss": 0.5455,
+      "step": 2000
+    },
+    {
+      "epoch": 2.1979953315941234,
+      "grad_norm": 0.5037225484848022,
+      "learning_rate": 8.901098901098901e-05,
+      "loss": 0.7445,
+      "step": 2001
+    },
+    {
+      "epoch": 2.1990937800356996,
+      "grad_norm": 0.8109555840492249,
+      "learning_rate": 8.888888888888888e-05,
+      "loss": 0.624,
+      "step": 2002
+    },
+    {
+      "epoch": 2.2001922284772757,
+      "grad_norm": 0.47120043635368347,
+      "learning_rate": 8.876678876678877e-05,
+      "loss": 0.6858,
+      "step": 2003
+    },
+    {
+      "epoch": 2.2012906769188523,
+      "grad_norm": 0.6166191101074219,
+      "learning_rate": 8.864468864468864e-05,
+      "loss": 0.4528,
+      "step": 2004
+    },
+    {
+      "epoch": 2.2023891253604284,
+      "grad_norm": 0.4999128580093384,
+      "learning_rate": 8.852258852258851e-05,
+      "loss": 0.712,
+      "step": 2005
+    },
+    {
+      "epoch": 2.2034875738020046,
+      "grad_norm": 1.1858354806900024,
+      "learning_rate": 8.84004884004884e-05,
+      "loss": 0.7647,
+      "step": 2006
+    },
+    {
+      "epoch": 2.204586022243581,
+      "grad_norm": 0.4223528206348419,
+      "learning_rate": 8.827838827838828e-05,
+      "loss": 0.6553,
+      "step": 2007
+    },
+    {
+      "epoch": 2.2056844706851573,
+      "grad_norm": 0.41678956151008606,
+      "learning_rate": 8.815628815628815e-05,
+      "loss": 0.6033,
+      "step": 2008
+    },
+    {
+      "epoch": 2.2067829191267334,
+      "grad_norm": 0.5812666416168213,
+      "learning_rate": 8.803418803418802e-05,
+      "loss": 0.6016,
+      "step": 2009
+    },
+    {
+      "epoch": 2.2078813675683095,
+      "grad_norm": 0.5553560256958008,
+      "learning_rate": 8.791208791208791e-05,
+      "loss": 0.7621,
+      "step": 2010
+    },
+    {
+      "epoch": 2.208979816009886,
+      "grad_norm": 0.6392796635627747,
+      "learning_rate": 8.778998778998778e-05,
+      "loss": 0.567,
+      "step": 2011
+    },
+    {
+      "epoch": 2.2100782644514623,
+      "grad_norm": 1.0086902379989624,
+      "learning_rate": 8.766788766788765e-05,
+      "loss": 0.9432,
+      "step": 2012
+    },
+    {
+      "epoch": 2.2111767128930384,
+      "grad_norm": 1.3578602075576782,
+      "learning_rate": 8.754578754578754e-05,
+      "loss": 0.5107,
+      "step": 2013
+    },
+    {
+      "epoch": 2.212275161334615,
+      "grad_norm": 0.5530524849891663,
+      "learning_rate": 8.742368742368741e-05,
+      "loss": 0.6078,
+      "step": 2014
+    },
+    {
+      "epoch": 2.213373609776191,
+      "grad_norm": 0.3795104920864105,
+      "learning_rate": 8.730158730158728e-05,
+      "loss": 0.4889,
+      "step": 2015
+    },
+    {
+      "epoch": 2.2144720582177673,
+      "grad_norm": 0.40977227687835693,
+      "learning_rate": 8.717948717948718e-05,
+      "loss": 0.6295,
+      "step": 2016
+    },
+    {
+      "epoch": 2.215570506659344,
+      "grad_norm": 0.4882934093475342,
+      "learning_rate": 8.705738705738705e-05,
+      "loss": 0.7219,
+      "step": 2017
+    },
+    {
+      "epoch": 2.21666895510092,
+      "grad_norm": 0.7966530919075012,
+      "learning_rate": 8.693528693528693e-05,
+      "loss": 0.5342,
+      "step": 2018
+    },
+    {
+      "epoch": 2.217767403542496,
+      "grad_norm": 0.6992311477661133,
+      "learning_rate": 8.681318681318681e-05,
+      "loss": 0.5932,
+      "step": 2019
+    },
+    {
+      "epoch": 2.2188658519840727,
+      "grad_norm": 0.396427720785141,
+      "learning_rate": 8.669108669108668e-05,
+      "loss": 0.5838,
+      "step": 2020
+    },
+    {
+      "epoch": 2.219964300425649,
+      "grad_norm": 0.5625690817832947,
+      "learning_rate": 8.656898656898655e-05,
+      "loss": 0.7605,
+      "step": 2021
+    },
+    {
+      "epoch": 2.221062748867225,
+      "grad_norm": 0.6052583456039429,
+      "learning_rate": 8.644688644688643e-05,
+      "loss": 0.6572,
+      "step": 2022
+    },
+    {
+      "epoch": 2.222161197308801,
+      "grad_norm": 0.7201973795890808,
+      "learning_rate": 8.632478632478633e-05,
+      "loss": 0.4924,
+      "step": 2023
+    },
+    {
+      "epoch": 2.2232596457503777,
+      "grad_norm": 0.4222647249698639,
+      "learning_rate": 8.62026862026862e-05,
+      "loss": 0.7764,
+      "step": 2024
+    },
+    {
+      "epoch": 2.224358094191954,
+      "grad_norm": 0.5168121457099915,
+      "learning_rate": 8.608058608058607e-05,
+      "loss": 0.5766,
+      "step": 2025
+    },
+    {
+      "epoch": 2.22545654263353,
+      "grad_norm": 0.886203408241272,
+      "learning_rate": 8.595848595848596e-05,
+      "loss": 0.3804,
+      "step": 2026
+    },
+    {
+      "epoch": 2.2265549910751066,
+      "grad_norm": 1.7365875244140625,
+      "learning_rate": 8.583638583638583e-05,
+      "loss": 0.6583,
+      "step": 2027
+    },
+    {
+      "epoch": 2.2276534395166827,
+      "grad_norm": 0.44519639015197754,
+      "learning_rate": 8.57142857142857e-05,
+      "loss": 0.7322,
+      "step": 2028
+    },
+    {
+      "epoch": 2.228751887958259,
+      "grad_norm": 0.4888206422328949,
+      "learning_rate": 8.55921855921856e-05,
+      "loss": 0.6645,
+      "step": 2029
+    },
+    {
+      "epoch": 2.2298503363998354,
+      "grad_norm": 0.598225474357605,
+      "learning_rate": 8.547008547008547e-05,
+      "loss": 0.7903,
+      "step": 2030
+    },
+    {
+      "epoch": 2.2309487848414116,
+      "grad_norm": 0.8521910905838013,
+      "learning_rate": 8.534798534798534e-05,
+      "loss": 0.8573,
+      "step": 2031
+    },
+    {
+      "epoch": 2.2320472332829877,
+      "grad_norm": 1.6346311569213867,
+      "learning_rate": 8.522588522588523e-05,
+      "loss": 0.5653,
+      "step": 2032
+    },
+    {
+      "epoch": 2.233145681724564,
+      "grad_norm": 0.6574315428733826,
+      "learning_rate": 8.51037851037851e-05,
+      "loss": 0.5289,
+      "step": 2033
+    },
+    {
+      "epoch": 2.2342441301661404,
+      "grad_norm": 0.3821216821670532,
+      "learning_rate": 8.498168498168497e-05,
+      "loss": 0.4627,
+      "step": 2034
+    },
+    {
+      "epoch": 2.2353425786077166,
+      "grad_norm": 0.28965023159980774,
+      "learning_rate": 8.485958485958484e-05,
+      "loss": 0.3696,
+      "step": 2035
+    },
+    {
+      "epoch": 2.2364410270492927,
+      "grad_norm": 0.8256242275238037,
+      "learning_rate": 8.473748473748473e-05,
+      "loss": 0.6305,
+      "step": 2036
+    },
+    {
+      "epoch": 2.2375394754908693,
+      "grad_norm": 0.8374451398849487,
+      "learning_rate": 8.46153846153846e-05,
+      "loss": 0.5038,
+      "step": 2037
+    },
+    {
+      "epoch": 2.2386379239324454,
+      "grad_norm": 0.5931464433670044,
+      "learning_rate": 8.449328449328449e-05,
+      "loss": 0.6928,
+      "step": 2038
+    },
+    {
+      "epoch": 2.2397363723740216,
+      "grad_norm": 0.5120035409927368,
+      "learning_rate": 8.437118437118437e-05,
+      "loss": 0.6004,
+      "step": 2039
+    },
+    {
+      "epoch": 2.240834820815598,
+      "grad_norm": 0.6345282196998596,
+      "learning_rate": 8.424908424908424e-05,
+      "loss": 0.866,
+      "step": 2040
+    },
+    {
+      "epoch": 2.2419332692571743,
+      "grad_norm": 0.5632284283638,
+      "learning_rate": 8.412698412698412e-05,
+      "loss": 0.406,
+      "step": 2041
+    },
+    {
+      "epoch": 2.2430317176987504,
+      "grad_norm": 0.4784685969352722,
+      "learning_rate": 8.4004884004884e-05,
+      "loss": 0.4732,
+      "step": 2042
+    },
+    {
+      "epoch": 2.2441301661403266,
+      "grad_norm": 0.47678086161613464,
+      "learning_rate": 8.388278388278387e-05,
+      "loss": 0.502,
+      "step": 2043
+    },
+    {
+      "epoch": 2.245228614581903,
+      "grad_norm": 0.6543307304382324,
+      "learning_rate": 8.376068376068374e-05,
+      "loss": 0.7183,
+      "step": 2044
+    },
+    {
+      "epoch": 2.2463270630234793,
+      "grad_norm": 0.6147063374519348,
+      "learning_rate": 8.363858363858364e-05,
+      "loss": 0.618,
+      "step": 2045
+    },
+    {
+      "epoch": 2.2474255114650554,
+      "grad_norm": 0.5867168307304382,
+      "learning_rate": 8.351648351648352e-05,
+      "loss": 0.7749,
+      "step": 2046
+    },
+    {
+      "epoch": 2.248523959906632,
+      "grad_norm": 1.164838433265686,
+      "learning_rate": 8.339438339438339e-05,
+      "loss": 0.6261,
+      "step": 2047
+    },
+    {
+      "epoch": 2.249622408348208,
+      "grad_norm": 0.6695102453231812,
+      "learning_rate": 8.327228327228326e-05,
+      "loss": 0.6172,
+      "step": 2048
+    },
+    {
+      "epoch": 2.2507208567897843,
+      "grad_norm": 0.43873751163482666,
+      "learning_rate": 8.315018315018315e-05,
+      "loss": 0.7032,
+      "step": 2049
+    },
+    {
+      "epoch": 2.251819305231361,
+      "grad_norm": 0.439897745847702,
+      "learning_rate": 8.302808302808302e-05,
+      "loss": 0.7744,
+      "step": 2050
+    },
+    {
+      "epoch": 2.252917753672937,
+      "grad_norm": 0.6671053767204285,
+      "learning_rate": 8.290598290598289e-05,
+      "loss": 0.6877,
+      "step": 2051
+    },
+    {
+      "epoch": 2.254016202114513,
+      "grad_norm": 0.37354105710983276,
+      "learning_rate": 8.278388278388279e-05,
+      "loss": 0.5653,
+      "step": 2052
+    },
+    {
+      "epoch": 2.2551146505560897,
+      "grad_norm": 0.5615684390068054,
+      "learning_rate": 8.266178266178266e-05,
+      "loss": 0.5961,
+      "step": 2053
+    },
+    {
+      "epoch": 2.256213098997666,
+      "grad_norm": 2.0932323932647705,
+      "learning_rate": 8.253968253968253e-05,
+      "loss": 0.6139,
+      "step": 2054
+    },
+    {
+      "epoch": 2.257311547439242,
+      "grad_norm": 0.5486952066421509,
+      "learning_rate": 8.241758241758242e-05,
+      "loss": 0.7816,
+      "step": 2055
+    },
+    {
+      "epoch": 2.258409995880818,
+      "grad_norm": 0.7377699017524719,
+      "learning_rate": 8.229548229548229e-05,
+      "loss": 0.5036,
+      "step": 2056
+    },
+    {
+      "epoch": 2.2595084443223947,
+      "grad_norm": 0.7057545781135559,
+      "learning_rate": 8.217338217338216e-05,
+      "loss": 0.5788,
+      "step": 2057
+    },
+    {
+      "epoch": 2.260606892763971,
+      "grad_norm": 0.5388674736022949,
+      "learning_rate": 8.205128205128205e-05,
+      "loss": 0.7079,
+      "step": 2058
+    },
+    {
+      "epoch": 2.261705341205547,
+      "grad_norm": 0.620943546295166,
+      "learning_rate": 8.192918192918192e-05,
+      "loss": 0.6223,
+      "step": 2059
+    },
+    {
+      "epoch": 2.2628037896471236,
+      "grad_norm": 0.6159489154815674,
+      "learning_rate": 8.18070818070818e-05,
+      "loss": 0.7277,
+      "step": 2060
+    },
+    {
+      "epoch": 2.2639022380886997,
+      "grad_norm": 0.5745131373405457,
+      "learning_rate": 8.168498168498168e-05,
+      "loss": 0.6356,
+      "step": 2061
+    },
+    {
+      "epoch": 2.265000686530276,
+      "grad_norm": 0.4925720989704132,
+      "learning_rate": 8.156288156288156e-05,
+      "loss": 0.6342,
+      "step": 2062
+    },
+    {
+      "epoch": 2.2660991349718524,
+      "grad_norm": 0.410692036151886,
+      "learning_rate": 8.144078144078143e-05,
+      "loss": 0.5903,
+      "step": 2063
+    },
+    {
+      "epoch": 2.2671975834134286,
+      "grad_norm": 0.8246005177497864,
+      "learning_rate": 8.13186813186813e-05,
+      "loss": 0.4048,
+      "step": 2064
+    },
+    {
+      "epoch": 2.2682960318550047,
+      "grad_norm": 0.5054492950439453,
+      "learning_rate": 8.119658119658119e-05,
+      "loss": 0.5797,
+      "step": 2065
+    },
+    {
+      "epoch": 2.2693944802965813,
+      "grad_norm": 0.6249692440032959,
+      "learning_rate": 8.107448107448106e-05,
+      "loss": 0.5434,
+      "step": 2066
+    },
+    {
+      "epoch": 2.2704929287381574,
+      "grad_norm": 0.5582659244537354,
+      "learning_rate": 8.095238095238093e-05,
+      "loss": 0.5925,
+      "step": 2067
+    },
+    {
+      "epoch": 2.2715913771797336,
+      "grad_norm": 0.38472238183021545,
+      "learning_rate": 8.083028083028083e-05,
+      "loss": 0.7325,
+      "step": 2068
+    },
+    {
+      "epoch": 2.2726898256213097,
+      "grad_norm": 0.4649077355861664,
+      "learning_rate": 8.07081807081807e-05,
+      "loss": 0.6244,
+      "step": 2069
+    },
+    {
+      "epoch": 2.2737882740628863,
+      "grad_norm": 0.38582849502563477,
+      "learning_rate": 8.058608058608058e-05,
+      "loss": 0.7696,
+      "step": 2070
+    },
+    {
+      "epoch": 2.2748867225044624,
+      "grad_norm": 0.4612105190753937,
+      "learning_rate": 8.046398046398045e-05,
+      "loss": 0.6453,
+      "step": 2071
+    },
+    {
+      "epoch": 2.2759851709460386,
+      "grad_norm": 0.6572852730751038,
+      "learning_rate": 8.034188034188034e-05,
+      "loss": 0.7417,
+      "step": 2072
+    },
+    {
+      "epoch": 2.277083619387615,
+      "grad_norm": 0.6322109699249268,
+      "learning_rate": 8.021978021978021e-05,
+      "loss": 0.2827,
+      "step": 2073
+    },
+    {
+      "epoch": 2.2781820678291913,
+      "grad_norm": 1.2452771663665771,
+      "learning_rate": 8.009768009768008e-05,
+      "loss": 0.7441,
+      "step": 2074
+    },
+    {
+      "epoch": 2.2792805162707674,
+      "grad_norm": 0.32154834270477295,
+      "learning_rate": 7.997557997557998e-05,
+      "loss": 0.4606,
+      "step": 2075
+    },
+    {
+      "epoch": 2.2803789647123436,
+      "grad_norm": 1.0170034170150757,
+      "learning_rate": 7.985347985347985e-05,
+      "loss": 0.7003,
+      "step": 2076
+    },
+    {
+      "epoch": 2.28147741315392,
+      "grad_norm": 0.7780435085296631,
+      "learning_rate": 7.973137973137972e-05,
+      "loss": 0.5847,
+      "step": 2077
+    },
+    {
+      "epoch": 2.2825758615954963,
+      "grad_norm": 0.6422854661941528,
+      "learning_rate": 7.960927960927961e-05,
+      "loss": 0.6278,
+      "step": 2078
+    },
+    {
+      "epoch": 2.2836743100370724,
+      "grad_norm": 0.5440393090248108,
+      "learning_rate": 7.948717948717948e-05,
+      "loss": 0.6313,
+      "step": 2079
+    },
+    {
+      "epoch": 2.284772758478649,
+      "grad_norm": 0.5774940848350525,
+      "learning_rate": 7.936507936507935e-05,
+      "loss": 0.7504,
+      "step": 2080
+    },
+    {
+      "epoch": 2.285871206920225,
+      "grad_norm": 0.44180789589881897,
+      "learning_rate": 7.924297924297924e-05,
+      "loss": 0.5806,
+      "step": 2081
+    },
+    {
+      "epoch": 2.2869696553618013,
+      "grad_norm": 0.8452728390693665,
+      "learning_rate": 7.912087912087912e-05,
+      "loss": 0.5753,
+      "step": 2082
+    },
+    {
+      "epoch": 2.288068103803378,
+      "grad_norm": 0.40172943472862244,
+      "learning_rate": 7.8998778998779e-05,
+      "loss": 0.5565,
+      "step": 2083
+    },
+    {
+      "epoch": 2.289166552244954,
+      "grad_norm": 0.3919180929660797,
+      "learning_rate": 7.887667887667887e-05,
+      "loss": 0.4951,
+      "step": 2084
+    },
+    {
+      "epoch": 2.29026500068653,
+      "grad_norm": 1.0796260833740234,
+      "learning_rate": 7.875457875457875e-05,
+      "loss": 0.733,
+      "step": 2085
+    },
+    {
+      "epoch": 2.2913634491281067,
+      "grad_norm": 0.5640047788619995,
+      "learning_rate": 7.863247863247862e-05,
+      "loss": 0.4625,
+      "step": 2086
+    },
+    {
+      "epoch": 2.292461897569683,
+      "grad_norm": 0.8736083507537842,
+      "learning_rate": 7.85103785103785e-05,
+      "loss": 0.5532,
+      "step": 2087
+    },
+    {
+      "epoch": 2.293560346011259,
+      "grad_norm": 0.5358221530914307,
+      "learning_rate": 7.838827838827838e-05,
+      "loss": 0.6397,
+      "step": 2088
+    },
+    {
+      "epoch": 2.294658794452835,
+      "grad_norm": 5.207391262054443,
+      "learning_rate": 7.826617826617825e-05,
+      "loss": 0.6402,
+      "step": 2089
+    },
+    {
+      "epoch": 2.2957572428944117,
+      "grad_norm": 0.4122523069381714,
+      "learning_rate": 7.814407814407813e-05,
+      "loss": 0.474,
+      "step": 2090
+    },
+    {
+      "epoch": 2.296855691335988,
+      "grad_norm": 2.8296186923980713,
+      "learning_rate": 7.802197802197802e-05,
+      "loss": 0.5197,
+      "step": 2091
+    },
+    {
+      "epoch": 2.297954139777564,
+      "grad_norm": 0.6898410320281982,
+      "learning_rate": 7.78998778998779e-05,
+      "loss": 0.782,
+      "step": 2092
+    },
+    {
+      "epoch": 2.2990525882191406,
+      "grad_norm": 0.37363025546073914,
+      "learning_rate": 7.777777777777777e-05,
+      "loss": 0.5824,
+      "step": 2093
+    },
+    {
+      "epoch": 2.3001510366607167,
+      "grad_norm": 0.5120764374732971,
+      "learning_rate": 7.765567765567765e-05,
+      "loss": 0.7326,
+      "step": 2094
+    },
+    {
+      "epoch": 2.301249485102293,
+      "grad_norm": 0.6517985463142395,
+      "learning_rate": 7.753357753357753e-05,
+      "loss": 0.6274,
+      "step": 2095
+    },
+    {
+      "epoch": 2.3023479335438695,
+      "grad_norm": 0.8033846020698547,
+      "learning_rate": 7.74114774114774e-05,
+      "loss": 0.7093,
+      "step": 2096
+    },
+    {
+      "epoch": 2.3034463819854456,
+      "grad_norm": 0.896397590637207,
+      "learning_rate": 7.728937728937727e-05,
+      "loss": 0.6685,
+      "step": 2097
+    },
+    {
+      "epoch": 2.3045448304270217,
+      "grad_norm": 0.4606597423553467,
+      "learning_rate": 7.716727716727717e-05,
+      "loss": 0.5821,
+      "step": 2098
+    },
+    {
+      "epoch": 2.3056432788685983,
+      "grad_norm": 0.9286845922470093,
+      "learning_rate": 7.704517704517704e-05,
+      "loss": 0.7537,
+      "step": 2099
+    },
+    {
+      "epoch": 2.3067417273101745,
+      "grad_norm": 0.6514043211936951,
+      "learning_rate": 7.692307692307691e-05,
+      "loss": 0.5644,
+      "step": 2100
+    },
+    {
+      "epoch": 2.3078401757517506,
+      "grad_norm": 0.4881083369255066,
+      "learning_rate": 7.68009768009768e-05,
+      "loss": 0.5348,
+      "step": 2101
+    },
+    {
+      "epoch": 2.3089386241933267,
+      "grad_norm": 2.688716173171997,
+      "learning_rate": 7.667887667887667e-05,
+      "loss": 0.6732,
+      "step": 2102
+    },
+    {
+      "epoch": 2.3100370726349033,
+      "grad_norm": 0.4597708582878113,
+      "learning_rate": 7.655677655677654e-05,
+      "loss": 0.6166,
+      "step": 2103
+    },
+    {
+      "epoch": 2.3111355210764795,
+      "grad_norm": 0.7629315853118896,
+      "learning_rate": 7.643467643467644e-05,
+      "loss": 0.4677,
+      "step": 2104
+    },
+    {
+      "epoch": 2.3122339695180556,
+      "grad_norm": 0.7282788753509521,
+      "learning_rate": 7.631257631257631e-05,
+      "loss": 0.6841,
+      "step": 2105
+    },
+    {
+      "epoch": 2.313332417959632,
+      "grad_norm": 0.5421862006187439,
+      "learning_rate": 7.619047619047618e-05,
+      "loss": 0.7274,
+      "step": 2106
+    },
+    {
+      "epoch": 2.3144308664012083,
+      "grad_norm": 0.7396867871284485,
+      "learning_rate": 7.606837606837607e-05,
+      "loss": 0.6546,
+      "step": 2107
+    },
+    {
+      "epoch": 2.3155293148427845,
+      "grad_norm": 0.34731313586235046,
+      "learning_rate": 7.594627594627594e-05,
+      "loss": 0.72,
+      "step": 2108
+    },
+    {
+      "epoch": 2.3166277632843606,
+      "grad_norm": 1.1024978160858154,
+      "learning_rate": 7.582417582417581e-05,
+      "loss": 0.7304,
+      "step": 2109
+    },
+    {
+      "epoch": 2.317726211725937,
+      "grad_norm": 0.5866183638572693,
+      "learning_rate": 7.570207570207569e-05,
+      "loss": 0.4912,
+      "step": 2110
+    },
+    {
+      "epoch": 2.3188246601675133,
+      "grad_norm": 0.8068836331367493,
+      "learning_rate": 7.557997557997557e-05,
+      "loss": 0.5342,
+      "step": 2111
+    },
+    {
+      "epoch": 2.31992310860909,
+      "grad_norm": 0.6417646408081055,
+      "learning_rate": 7.545787545787544e-05,
+      "loss": 0.7642,
+      "step": 2112
+    },
+    {
+      "epoch": 2.321021557050666,
+      "grad_norm": 0.4545808434486389,
+      "learning_rate": 7.533577533577533e-05,
+      "loss": 0.5681,
+      "step": 2113
+    },
+    {
+      "epoch": 2.322120005492242,
+      "grad_norm": 0.3567211329936981,
+      "learning_rate": 7.521367521367521e-05,
+      "loss": 0.6368,
+      "step": 2114
+    },
+    {
+      "epoch": 2.3232184539338183,
+      "grad_norm": 0.5747010707855225,
+      "learning_rate": 7.509157509157509e-05,
+      "loss": 0.5848,
+      "step": 2115
+    },
+    {
+      "epoch": 2.324316902375395,
+      "grad_norm": 0.46303555369377136,
+      "learning_rate": 7.496947496947497e-05,
+      "loss": 0.6577,
+      "step": 2116
+    },
+    {
+      "epoch": 2.325415350816971,
+      "grad_norm": 0.5343080759048462,
+      "learning_rate": 7.484737484737484e-05,
+      "loss": 0.8531,
+      "step": 2117
+    },
+    {
+      "epoch": 2.326513799258547,
+      "grad_norm": 0.9027140736579895,
+      "learning_rate": 7.472527472527472e-05,
+      "loss": 0.6271,
+      "step": 2118
+    },
+    {
+      "epoch": 2.3276122477001238,
+      "grad_norm": 0.6390063166618347,
+      "learning_rate": 7.460317460317459e-05,
+      "loss": 0.5669,
+      "step": 2119
+    },
+    {
+      "epoch": 2.3287106961417,
+      "grad_norm": 0.4965013563632965,
+      "learning_rate": 7.448107448107447e-05,
+      "loss": 0.6362,
+      "step": 2120
+    },
+    {
+      "epoch": 2.329809144583276,
+      "grad_norm": 0.49252766370773315,
+      "learning_rate": 7.435897435897436e-05,
+      "loss": 0.6703,
+      "step": 2121
+    },
+    {
+      "epoch": 2.330907593024852,
+      "grad_norm": 0.7043023705482483,
+      "learning_rate": 7.423687423687423e-05,
+      "loss": 0.7114,
+      "step": 2122
+    },
+    {
+      "epoch": 2.3320060414664288,
+      "grad_norm": 0.4373185634613037,
+      "learning_rate": 7.41147741147741e-05,
+      "loss": 0.5656,
+      "step": 2123
+    },
+    {
+      "epoch": 2.333104489908005,
+      "grad_norm": 1.0036537647247314,
+      "learning_rate": 7.399267399267399e-05,
+      "loss": 0.6652,
+      "step": 2124
+    },
+    {
+      "epoch": 2.334202938349581,
+      "grad_norm": 2.06589937210083,
+      "learning_rate": 7.387057387057386e-05,
+      "loss": 0.6502,
+      "step": 2125
+    },
+    {
+      "epoch": 2.3353013867911576,
+      "grad_norm": 1.1616554260253906,
+      "learning_rate": 7.374847374847375e-05,
+      "loss": 0.7288,
+      "step": 2126
+    },
+    {
+      "epoch": 2.3363998352327338,
+      "grad_norm": 0.4532950520515442,
+      "learning_rate": 7.362637362637362e-05,
+      "loss": 0.7696,
+      "step": 2127
+    },
+    {
+      "epoch": 2.33749828367431,
+      "grad_norm": 1.0143449306488037,
+      "learning_rate": 7.35042735042735e-05,
+      "loss": 1.0185,
+      "step": 2128
+    },
+    {
+      "epoch": 2.3385967321158865,
+      "grad_norm": 2.2059850692749023,
+      "learning_rate": 7.338217338217337e-05,
+      "loss": 0.6267,
+      "step": 2129
+    },
+    {
+      "epoch": 2.3396951805574626,
+      "grad_norm": 0.4883456826210022,
+      "learning_rate": 7.326007326007325e-05,
+      "loss": 0.6081,
+      "step": 2130
+    },
+    {
+      "epoch": 2.3407936289990388,
+      "grad_norm": 0.42373138666152954,
+      "learning_rate": 7.313797313797313e-05,
+      "loss": 0.6204,
+      "step": 2131
+    },
+    {
+      "epoch": 2.3418920774406153,
+      "grad_norm": 0.43958979845046997,
+      "learning_rate": 7.3015873015873e-05,
+      "loss": 0.7608,
+      "step": 2132
+    },
+    {
+      "epoch": 2.3429905258821915,
+      "grad_norm": 0.4493010342121124,
+      "learning_rate": 7.289377289377289e-05,
+      "loss": 0.5985,
+      "step": 2133
+    },
+    {
+      "epoch": 2.3440889743237676,
+      "grad_norm": 0.38533085584640503,
+      "learning_rate": 7.277167277167276e-05,
+      "loss": 0.445,
+      "step": 2134
+    },
+    {
+      "epoch": 2.3451874227653438,
+      "grad_norm": 0.37900710105895996,
+      "learning_rate": 7.264957264957265e-05,
+      "loss": 0.8466,
+      "step": 2135
+    },
+    {
+      "epoch": 2.3462858712069203,
+      "grad_norm": 1.7598285675048828,
+      "learning_rate": 7.252747252747252e-05,
+      "loss": 0.6881,
+      "step": 2136
+    },
+    {
+      "epoch": 2.3473843196484965,
+      "grad_norm": 0.5551338791847229,
+      "learning_rate": 7.24053724053724e-05,
+      "loss": 0.5908,
+      "step": 2137
+    },
+    {
+      "epoch": 2.3484827680900726,
+      "grad_norm": 0.42995861172676086,
+      "learning_rate": 7.228327228327228e-05,
+      "loss": 0.689,
+      "step": 2138
+    },
+    {
+      "epoch": 2.349581216531649,
+      "grad_norm": 0.6428760290145874,
+      "learning_rate": 7.216117216117216e-05,
+      "loss": 0.5879,
+      "step": 2139
+    },
+    {
+      "epoch": 2.3506796649732253,
+      "grad_norm": 0.6199445724487305,
+      "learning_rate": 7.203907203907203e-05,
+      "loss": 0.5275,
+      "step": 2140
+    },
+    {
+      "epoch": 2.3517781134148015,
+      "grad_norm": 0.4687311053276062,
+      "learning_rate": 7.19169719169719e-05,
+      "loss": 0.7046,
+      "step": 2141
+    },
+    {
+      "epoch": 2.352876561856378,
+      "grad_norm": 0.47645121812820435,
+      "learning_rate": 7.179487179487179e-05,
+      "loss": 0.4787,
+      "step": 2142
+    },
+    {
+      "epoch": 2.353975010297954,
+      "grad_norm": 1.3774843215942383,
+      "learning_rate": 7.167277167277166e-05,
+      "loss": 0.565,
+      "step": 2143
+    },
+    {
+      "epoch": 2.3550734587395303,
+      "grad_norm": 0.9585548043251038,
+      "learning_rate": 7.155067155067155e-05,
+      "loss": 0.7496,
+      "step": 2144
+    },
+    {
+      "epoch": 2.356171907181107,
+      "grad_norm": 0.9073938131332397,
+      "learning_rate": 7.142857142857142e-05,
+      "loss": 0.6785,
+      "step": 2145
+    },
+    {
+      "epoch": 2.357270355622683,
+      "grad_norm": 1.4543087482452393,
+      "learning_rate": 7.13064713064713e-05,
+      "loss": 0.4827,
+      "step": 2146
+    },
+    {
+      "epoch": 2.358368804064259,
+      "grad_norm": 0.49685895442962646,
+      "learning_rate": 7.118437118437118e-05,
+      "loss": 0.5624,
+      "step": 2147
+    },
+    {
+      "epoch": 2.3594672525058353,
+      "grad_norm": 0.3820716142654419,
+      "learning_rate": 7.106227106227105e-05,
+      "loss": 0.5326,
+      "step": 2148
+    },
+    {
+      "epoch": 2.360565700947412,
+      "grad_norm": 0.6018278002738953,
+      "learning_rate": 7.094017094017094e-05,
+      "loss": 0.7372,
+      "step": 2149
+    },
+    {
+      "epoch": 2.361664149388988,
+      "grad_norm": 0.49245381355285645,
+      "learning_rate": 7.081807081807082e-05,
+      "loss": 0.714,
+      "step": 2150
+    },
+    {
+      "epoch": 2.362762597830564,
+      "grad_norm": 0.5913417339324951,
+      "learning_rate": 7.069597069597069e-05,
+      "loss": 0.6395,
+      "step": 2151
+    },
+    {
+      "epoch": 2.3638610462721408,
+      "grad_norm": 0.3142958879470825,
+      "learning_rate": 7.057387057387056e-05,
+      "loss": 0.4363,
+      "step": 2152
+    },
+    {
+      "epoch": 2.364959494713717,
+      "grad_norm": 0.44251006841659546,
+      "learning_rate": 7.045177045177044e-05,
+      "loss": 0.5751,
+      "step": 2153
+    },
+    {
+      "epoch": 2.366057943155293,
+      "grad_norm": 0.7642143964767456,
+      "learning_rate": 7.032967032967032e-05,
+      "loss": 0.9707,
+      "step": 2154
+    },
+    {
+      "epoch": 2.367156391596869,
+      "grad_norm": 0.3676380217075348,
+      "learning_rate": 7.020757020757021e-05,
+      "loss": 0.6142,
+      "step": 2155
+    },
+    {
+      "epoch": 2.3682548400384458,
+      "grad_norm": 0.43112027645111084,
+      "learning_rate": 7.008547008547008e-05,
+      "loss": 0.6194,
+      "step": 2156
+    },
+    {
+      "epoch": 2.369353288480022,
+      "grad_norm": 0.5463792681694031,
+      "learning_rate": 6.996336996336996e-05,
+      "loss": 0.5478,
+      "step": 2157
+    },
+    {
+      "epoch": 2.370451736921598,
+      "grad_norm": 0.5498053431510925,
+      "learning_rate": 6.984126984126984e-05,
+      "loss": 0.8373,
+      "step": 2158
+    },
+    {
+      "epoch": 2.3715501853631746,
+      "grad_norm": 0.5144299268722534,
+      "learning_rate": 6.971916971916971e-05,
+      "loss": 0.7033,
+      "step": 2159
+    },
+    {
+      "epoch": 2.3726486338047508,
+      "grad_norm": 0.4049033522605896,
+      "learning_rate": 6.95970695970696e-05,
+      "loss": 0.6257,
+      "step": 2160
+    },
+    {
+      "epoch": 2.373747082246327,
+      "grad_norm": 0.8007866740226746,
+      "learning_rate": 6.947496947496947e-05,
+      "loss": 1.1859,
+      "step": 2161
+    },
+    {
+      "epoch": 2.3748455306879035,
+      "grad_norm": 0.6302816867828369,
+      "learning_rate": 6.935286935286935e-05,
+      "loss": 0.4972,
+      "step": 2162
+    },
+    {
+      "epoch": 2.3759439791294796,
+      "grad_norm": 0.4181542694568634,
+      "learning_rate": 6.923076923076922e-05,
+      "loss": 0.5543,
+      "step": 2163
+    },
+    {
+      "epoch": 2.3770424275710558,
+      "grad_norm": 0.45409703254699707,
+      "learning_rate": 6.91086691086691e-05,
+      "loss": 0.6237,
+      "step": 2164
+    },
+    {
+      "epoch": 2.3781408760126324,
+      "grad_norm": 0.5172666907310486,
+      "learning_rate": 6.898656898656898e-05,
+      "loss": 0.5798,
+      "step": 2165
+    },
+    {
+      "epoch": 2.3792393244542085,
+      "grad_norm": 0.7849127054214478,
+      "learning_rate": 6.886446886446885e-05,
+      "loss": 0.8282,
+      "step": 2166
+    },
+    {
+      "epoch": 2.3803377728957846,
+      "grad_norm": 0.4041041135787964,
+      "learning_rate": 6.874236874236874e-05,
+      "loss": 0.5046,
+      "step": 2167
+    },
+    {
+      "epoch": 2.3814362213373608,
+      "grad_norm": 0.35880064964294434,
+      "learning_rate": 6.862026862026862e-05,
+      "loss": 0.4096,
+      "step": 2168
+    },
+    {
+      "epoch": 2.3825346697789374,
+      "grad_norm": 0.5949457883834839,
+      "learning_rate": 6.84981684981685e-05,
+      "loss": 0.6666,
+      "step": 2169
+    },
+    {
+      "epoch": 2.3836331182205135,
+      "grad_norm": 0.6332186460494995,
+      "learning_rate": 6.837606837606837e-05,
+      "loss": 0.9715,
+      "step": 2170
+    },
+    {
+      "epoch": 2.3847315666620896,
+      "grad_norm": 0.3173432946205139,
+      "learning_rate": 6.825396825396824e-05,
+      "loss": 0.6792,
+      "step": 2171
+    },
+    {
+      "epoch": 2.385830015103666,
+      "grad_norm": 0.7556782364845276,
+      "learning_rate": 6.813186813186813e-05,
+      "loss": 0.7267,
+      "step": 2172
+    },
+    {
+      "epoch": 2.3869284635452424,
+      "grad_norm": 0.43191683292388916,
+      "learning_rate": 6.800976800976801e-05,
+      "loss": 0.5841,
+      "step": 2173
+    },
+    {
+      "epoch": 2.3880269119868185,
+      "grad_norm": 0.4010660946369171,
+      "learning_rate": 6.788766788766788e-05,
+      "loss": 0.7491,
+      "step": 2174
+    },
+    {
+      "epoch": 2.389125360428395,
+      "grad_norm": 0.6889204382896423,
+      "learning_rate": 6.776556776556775e-05,
+      "loss": 0.4539,
+      "step": 2175
+    },
+    {
+      "epoch": 2.390223808869971,
+      "grad_norm": 0.4509136974811554,
+      "learning_rate": 6.764346764346764e-05,
+      "loss": 0.7066,
+      "step": 2176
+    },
+    {
+      "epoch": 2.3913222573115474,
+      "grad_norm": 0.4313298463821411,
+      "learning_rate": 6.752136752136751e-05,
+      "loss": 0.6292,
+      "step": 2177
+    },
+    {
+      "epoch": 2.392420705753124,
+      "grad_norm": 0.7713265419006348,
+      "learning_rate": 6.73992673992674e-05,
+      "loss": 0.8392,
+      "step": 2178
+    },
+    {
+      "epoch": 2.3935191541947,
+      "grad_norm": 0.5283428430557251,
+      "learning_rate": 6.727716727716727e-05,
+      "loss": 0.6912,
+      "step": 2179
+    },
+    {
+      "epoch": 2.394617602636276,
+      "grad_norm": 0.40429314970970154,
+      "learning_rate": 6.715506715506716e-05,
+      "loss": 0.4335,
+      "step": 2180
+    },
+    {
+      "epoch": 2.3957160510778523,
+      "grad_norm": 0.6888754367828369,
+      "learning_rate": 6.703296703296703e-05,
+      "loss": 0.6276,
+      "step": 2181
+    },
+    {
+      "epoch": 2.396814499519429,
+      "grad_norm": 0.5595026612281799,
+      "learning_rate": 6.69108669108669e-05,
+      "loss": 0.7806,
+      "step": 2182
+    },
+    {
+      "epoch": 2.397912947961005,
+      "grad_norm": 0.32394587993621826,
+      "learning_rate": 6.678876678876678e-05,
+      "loss": 0.5531,
+      "step": 2183
+    },
+    {
+      "epoch": 2.399011396402581,
+      "grad_norm": 0.5909039974212646,
+      "learning_rate": 6.666666666666666e-05,
+      "loss": 0.4932,
+      "step": 2184
+    },
+    {
+      "epoch": 2.400109844844158,
+      "grad_norm": 0.4148501455783844,
+      "learning_rate": 6.654456654456654e-05,
+      "loss": 0.5637,
+      "step": 2185
+    },
+    {
+      "epoch": 2.401208293285734,
+      "grad_norm": 0.558403491973877,
+      "learning_rate": 6.642246642246641e-05,
+      "loss": 0.5733,
+      "step": 2186
+    },
+    {
+      "epoch": 2.40230674172731,
+      "grad_norm": 0.5171149373054504,
+      "learning_rate": 6.630036630036629e-05,
+      "loss": 0.6931,
+      "step": 2187
+    },
+    {
+      "epoch": 2.403405190168886,
+      "grad_norm": 0.44966164231300354,
+      "learning_rate": 6.617826617826617e-05,
+      "loss": 0.5061,
+      "step": 2188
+    },
+    {
+      "epoch": 2.404503638610463,
+      "grad_norm": 0.45499417185783386,
+      "learning_rate": 6.605616605616606e-05,
+      "loss": 0.3726,
+      "step": 2189
+    },
+    {
+      "epoch": 2.405602087052039,
+      "grad_norm": 0.5790139436721802,
+      "learning_rate": 6.593406593406593e-05,
+      "loss": 0.6647,
+      "step": 2190
+    },
+    {
+      "epoch": 2.4067005354936155,
+      "grad_norm": 0.5948793292045593,
+      "learning_rate": 6.581196581196581e-05,
+      "loss": 0.765,
+      "step": 2191
+    },
+    {
+      "epoch": 2.4077989839351917,
+      "grad_norm": 0.5925643444061279,
+      "learning_rate": 6.568986568986569e-05,
+      "loss": 0.889,
+      "step": 2192
+    },
+    {
+      "epoch": 2.408897432376768,
+      "grad_norm": 0.5776219964027405,
+      "learning_rate": 6.556776556776556e-05,
+      "loss": 0.5506,
+      "step": 2193
+    },
+    {
+      "epoch": 2.409995880818344,
+      "grad_norm": 0.44397997856140137,
+      "learning_rate": 6.544566544566544e-05,
+      "loss": 0.5372,
+      "step": 2194
+    },
+    {
+      "epoch": 2.4110943292599205,
+      "grad_norm": 0.45733606815338135,
+      "learning_rate": 6.532356532356532e-05,
+      "loss": 0.7207,
+      "step": 2195
+    },
+    {
+      "epoch": 2.4121927777014966,
+      "grad_norm": 0.38223645091056824,
+      "learning_rate": 6.52014652014652e-05,
+      "loss": 0.5888,
+      "step": 2196
+    },
+    {
+      "epoch": 2.413291226143073,
+      "grad_norm": 0.3642580211162567,
+      "learning_rate": 6.507936507936507e-05,
+      "loss": 0.5687,
+      "step": 2197
+    },
+    {
+      "epoch": 2.4143896745846494,
+      "grad_norm": 0.42435723543167114,
+      "learning_rate": 6.495726495726494e-05,
+      "loss": 0.6056,
+      "step": 2198
+    },
+    {
+      "epoch": 2.4154881230262255,
+      "grad_norm": 0.4998740255832672,
+      "learning_rate": 6.483516483516483e-05,
+      "loss": 0.6813,
+      "step": 2199
+    },
+    {
+      "epoch": 2.4165865714678016,
+      "grad_norm": 0.47158849239349365,
+      "learning_rate": 6.47130647130647e-05,
+      "loss": 0.5585,
+      "step": 2200
+    },
+    {
+      "epoch": 2.417685019909378,
+      "grad_norm": 0.4780612289905548,
+      "learning_rate": 6.459096459096459e-05,
+      "loss": 0.4941,
+      "step": 2201
+    },
+    {
+      "epoch": 2.4187834683509544,
+      "grad_norm": 0.5073630809783936,
+      "learning_rate": 6.446886446886447e-05,
+      "loss": 0.4549,
+      "step": 2202
+    },
+    {
+      "epoch": 2.4198819167925305,
+      "grad_norm": 0.4311310052871704,
+      "learning_rate": 6.434676434676435e-05,
+      "loss": 0.4419,
+      "step": 2203
+    },
+    {
+      "epoch": 2.4209803652341066,
+      "grad_norm": 0.3557896316051483,
+      "learning_rate": 6.422466422466422e-05,
+      "loss": 0.6973,
+      "step": 2204
+    },
+    {
+      "epoch": 2.4220788136756832,
+      "grad_norm": 0.6171516180038452,
+      "learning_rate": 6.410256410256409e-05,
+      "loss": 0.7554,
+      "step": 2205
+    },
+    {
+      "epoch": 2.4231772621172594,
+      "grad_norm": 0.4687957465648651,
+      "learning_rate": 6.398046398046397e-05,
+      "loss": 0.7429,
+      "step": 2206
+    },
+    {
+      "epoch": 2.4242757105588355,
+      "grad_norm": 0.8685696125030518,
+      "learning_rate": 6.385836385836386e-05,
+      "loss": 0.5896,
+      "step": 2207
+    },
+    {
+      "epoch": 2.425374159000412,
+      "grad_norm": 0.39599040150642395,
+      "learning_rate": 6.373626373626373e-05,
+      "loss": 0.4744,
+      "step": 2208
+    },
+    {
+      "epoch": 2.4264726074419882,
+      "grad_norm": 0.9079630970954895,
+      "learning_rate": 6.36141636141636e-05,
+      "loss": 0.6067,
+      "step": 2209
+    },
+    {
+      "epoch": 2.4275710558835644,
+      "grad_norm": 0.5051462054252625,
+      "learning_rate": 6.349206349206349e-05,
+      "loss": 0.7314,
+      "step": 2210
+    },
+    {
+      "epoch": 2.428669504325141,
+      "grad_norm": 0.4899844825267792,
+      "learning_rate": 6.336996336996336e-05,
+      "loss": 0.7086,
+      "step": 2211
+    },
+    {
+      "epoch": 2.429767952766717,
+      "grad_norm": 0.5135432481765747,
+      "learning_rate": 6.324786324786325e-05,
+      "loss": 0.5261,
+      "step": 2212
+    },
+    {
+      "epoch": 2.4308664012082932,
+      "grad_norm": 0.6025048494338989,
+      "learning_rate": 6.312576312576312e-05,
+      "loss": 0.5276,
+      "step": 2213
+    },
+    {
+      "epoch": 2.4319648496498694,
+      "grad_norm": 0.6931442022323608,
+      "learning_rate": 6.3003663003663e-05,
+      "loss": 0.6535,
+      "step": 2214
+    },
+    {
+      "epoch": 2.433063298091446,
+      "grad_norm": 0.695106565952301,
+      "learning_rate": 6.288156288156288e-05,
+      "loss": 0.9183,
+      "step": 2215
+    },
+    {
+      "epoch": 2.434161746533022,
+      "grad_norm": 0.450100302696228,
+      "learning_rate": 6.275946275946275e-05,
+      "loss": 0.5049,
+      "step": 2216
+    },
+    {
+      "epoch": 2.4352601949745982,
+      "grad_norm": 0.5539785623550415,
+      "learning_rate": 6.263736263736263e-05,
+      "loss": 0.5735,
+      "step": 2217
+    },
+    {
+      "epoch": 2.436358643416175,
+      "grad_norm": 0.5560977458953857,
+      "learning_rate": 6.25152625152625e-05,
+      "loss": 0.7364,
+      "step": 2218
+    },
+    {
+      "epoch": 2.437457091857751,
+      "grad_norm": 0.740195095539093,
+      "learning_rate": 6.239316239316239e-05,
+      "loss": 0.7839,
+      "step": 2219
+    },
+    {
+      "epoch": 2.438555540299327,
+      "grad_norm": 0.9324271082878113,
+      "learning_rate": 6.227106227106226e-05,
+      "loss": 0.6365,
+      "step": 2220
+    },
+    {
+      "epoch": 2.4396539887409037,
+      "grad_norm": 0.5540104508399963,
+      "learning_rate": 6.214896214896215e-05,
+      "loss": 0.6586,
+      "step": 2221
+    },
+    {
+      "epoch": 2.44075243718248,
+      "grad_norm": 0.5028054714202881,
+      "learning_rate": 6.202686202686202e-05,
+      "loss": 0.4422,
+      "step": 2222
+    },
+    {
+      "epoch": 2.441850885624056,
+      "grad_norm": 0.7052125930786133,
+      "learning_rate": 6.190476190476189e-05,
+      "loss": 0.7248,
+      "step": 2223
+    },
+    {
+      "epoch": 2.4429493340656325,
+      "grad_norm": 0.6705207824707031,
+      "learning_rate": 6.178266178266178e-05,
+      "loss": 0.81,
+      "step": 2224
+    },
+    {
+      "epoch": 2.4440477825072087,
+      "grad_norm": 0.7996514439582825,
+      "learning_rate": 6.166056166056166e-05,
+      "loss": 0.382,
+      "step": 2225
+    },
+    {
+      "epoch": 2.445146230948785,
+      "grad_norm": 1.5169689655303955,
+      "learning_rate": 6.153846153846154e-05,
+      "loss": 0.7373,
+      "step": 2226
+    },
+    {
+      "epoch": 2.446244679390361,
+      "grad_norm": 0.8039339780807495,
+      "learning_rate": 6.141636141636141e-05,
+      "loss": 0.8609,
+      "step": 2227
+    },
+    {
+      "epoch": 2.4473431278319375,
+      "grad_norm": 0.6489125490188599,
+      "learning_rate": 6.129426129426128e-05,
+      "loss": 0.6309,
+      "step": 2228
+    },
+    {
+      "epoch": 2.4484415762735137,
+      "grad_norm": 0.533184826374054,
+      "learning_rate": 6.117216117216116e-05,
+      "loss": 0.5166,
+      "step": 2229
+    },
+    {
+      "epoch": 2.44954002471509,
+      "grad_norm": 0.5699225068092346,
+      "learning_rate": 6.105006105006105e-05,
+      "loss": 0.7276,
+      "step": 2230
+    },
+    {
+      "epoch": 2.4506384731566664,
+      "grad_norm": 0.5552012324333191,
+      "learning_rate": 6.092796092796092e-05,
+      "loss": 0.636,
+      "step": 2231
+    },
+    {
+      "epoch": 2.4517369215982425,
+      "grad_norm": 0.4785599112510681,
+      "learning_rate": 6.08058608058608e-05,
+      "loss": 0.6362,
+      "step": 2232
+    },
+    {
+      "epoch": 2.4528353700398187,
+      "grad_norm": 0.740872859954834,
+      "learning_rate": 6.068376068376068e-05,
+      "loss": 0.5603,
+      "step": 2233
+    },
+    {
+      "epoch": 2.453933818481395,
+      "grad_norm": 0.5217441916465759,
+      "learning_rate": 6.056166056166056e-05,
+      "loss": 0.6306,
+      "step": 2234
+    },
+    {
+      "epoch": 2.4550322669229714,
+      "grad_norm": 0.446481853723526,
+      "learning_rate": 6.043956043956044e-05,
+      "loss": 0.8156,
+      "step": 2235
+    },
+    {
+      "epoch": 2.4561307153645475,
+      "grad_norm": 0.6527410745620728,
+      "learning_rate": 6.031746031746031e-05,
+      "loss": 0.7057,
+      "step": 2236
+    },
+    {
+      "epoch": 2.4572291638061237,
+      "grad_norm": 0.6801958680152893,
+      "learning_rate": 6.019536019536019e-05,
+      "loss": 0.7718,
+      "step": 2237
+    },
+    {
+      "epoch": 2.4583276122477002,
+      "grad_norm": 1.0723007917404175,
+      "learning_rate": 6.007326007326007e-05,
+      "loss": 0.5552,
+      "step": 2238
+    },
+    {
+      "epoch": 2.4594260606892764,
+      "grad_norm": 0.4058208763599396,
+      "learning_rate": 5.9951159951159945e-05,
+      "loss": 0.5035,
+      "step": 2239
+    },
+    {
+      "epoch": 2.4605245091308525,
+      "grad_norm": 0.5384330153465271,
+      "learning_rate": 5.9829059829059824e-05,
+      "loss": 0.5059,
+      "step": 2240
+    },
+    {
+      "epoch": 2.461622957572429,
+      "grad_norm": 0.7797716856002808,
+      "learning_rate": 5.9706959706959696e-05,
+      "loss": 0.5613,
+      "step": 2241
+    },
+    {
+      "epoch": 2.4627214060140052,
+      "grad_norm": 2.9689226150512695,
+      "learning_rate": 5.958485958485958e-05,
+      "loss": 0.6219,
+      "step": 2242
+    },
+    {
+      "epoch": 2.4638198544555814,
+      "grad_norm": 0.47863152623176575,
+      "learning_rate": 5.946275946275946e-05,
+      "loss": 0.5498,
+      "step": 2243
+    },
+    {
+      "epoch": 2.464918302897158,
+      "grad_norm": 0.49707144498825073,
+      "learning_rate": 5.934065934065933e-05,
+      "loss": 0.775,
+      "step": 2244
+    },
+    {
+      "epoch": 2.466016751338734,
+      "grad_norm": 0.3437495529651642,
+      "learning_rate": 5.921855921855922e-05,
+      "loss": 0.4592,
+      "step": 2245
+    },
+    {
+      "epoch": 2.4671151997803102,
+      "grad_norm": 0.7298309206962585,
+      "learning_rate": 5.9096459096459096e-05,
+      "loss": 0.5374,
+      "step": 2246
+    },
+    {
+      "epoch": 2.4682136482218864,
+      "grad_norm": 0.6666691303253174,
+      "learning_rate": 5.897435897435897e-05,
+      "loss": 0.424,
+      "step": 2247
+    },
+    {
+      "epoch": 2.469312096663463,
+      "grad_norm": 0.5841661691665649,
+      "learning_rate": 5.8852258852258847e-05,
+      "loss": 0.5316,
+      "step": 2248
+    },
+    {
+      "epoch": 2.470410545105039,
+      "grad_norm": 0.4921081066131592,
+      "learning_rate": 5.873015873015872e-05,
+      "loss": 0.6901,
+      "step": 2249
+    },
+    {
+      "epoch": 2.4715089935466152,
+      "grad_norm": 0.4779987633228302,
+      "learning_rate": 5.8608058608058604e-05,
+      "loss": 0.8976,
+      "step": 2250
+    },
+    {
+      "epoch": 2.472607441988192,
+      "grad_norm": 0.43142780661582947,
+      "learning_rate": 5.848595848595848e-05,
+      "loss": 0.4915,
+      "step": 2251
+    },
+    {
+      "epoch": 2.473705890429768,
+      "grad_norm": 1.132870078086853,
+      "learning_rate": 5.8363858363858355e-05,
+      "loss": 0.6633,
+      "step": 2252
+    },
+    {
+      "epoch": 2.474804338871344,
+      "grad_norm": 0.5674893856048584,
+      "learning_rate": 5.824175824175824e-05,
+      "loss": 0.5023,
+      "step": 2253
+    },
+    {
+      "epoch": 2.4759027873129207,
+      "grad_norm": 0.42495957016944885,
+      "learning_rate": 5.811965811965811e-05,
+      "loss": 0.6544,
+      "step": 2254
+    },
+    {
+      "epoch": 2.477001235754497,
+      "grad_norm": 0.8031434416770935,
+      "learning_rate": 5.799755799755799e-05,
+      "loss": 0.892,
+      "step": 2255
+    },
+    {
+      "epoch": 2.478099684196073,
+      "grad_norm": 0.7715115547180176,
+      "learning_rate": 5.7875457875457876e-05,
+      "loss": 0.5659,
+      "step": 2256
+    },
+    {
+      "epoch": 2.4791981326376495,
+      "grad_norm": 0.6882114410400391,
+      "learning_rate": 5.775335775335775e-05,
+      "loss": 0.5154,
+      "step": 2257
+    },
+    {
+      "epoch": 2.4802965810792257,
+      "grad_norm": 0.4994114935398102,
+      "learning_rate": 5.763125763125763e-05,
+      "loss": 0.6001,
+      "step": 2258
+    },
+    {
+      "epoch": 2.481395029520802,
+      "grad_norm": 0.45008450746536255,
+      "learning_rate": 5.7509157509157506e-05,
+      "loss": 0.7076,
+      "step": 2259
+    },
+    {
+      "epoch": 2.482493477962378,
+      "grad_norm": 0.654270350933075,
+      "learning_rate": 5.738705738705738e-05,
+      "loss": 0.5809,
+      "step": 2260
+    },
+    {
+      "epoch": 2.4835919264039545,
+      "grad_norm": 0.6344896554946899,
+      "learning_rate": 5.726495726495726e-05,
+      "loss": 0.6059,
+      "step": 2261
+    },
+    {
+      "epoch": 2.4846903748455307,
+      "grad_norm": 0.44090238213539124,
+      "learning_rate": 5.7142857142857135e-05,
+      "loss": 0.7953,
+      "step": 2262
+    },
+    {
+      "epoch": 2.485788823287107,
+      "grad_norm": 0.47564128041267395,
+      "learning_rate": 5.7020757020757014e-05,
+      "loss": 0.5062,
+      "step": 2263
+    },
+    {
+      "epoch": 2.4868872717286834,
+      "grad_norm": 0.3644583225250244,
+      "learning_rate": 5.68986568986569e-05,
+      "loss": 0.6417,
+      "step": 2264
+    },
+    {
+      "epoch": 2.4879857201702595,
+      "grad_norm": 0.5264548659324646,
+      "learning_rate": 5.677655677655677e-05,
+      "loss": 0.5971,
+      "step": 2265
+    },
+    {
+      "epoch": 2.4890841686118357,
+      "grad_norm": 0.7300589680671692,
+      "learning_rate": 5.665445665445665e-05,
+      "loss": 0.6249,
+      "step": 2266
+    },
+    {
+      "epoch": 2.490182617053412,
+      "grad_norm": 0.9016311764717102,
+      "learning_rate": 5.653235653235652e-05,
+      "loss": 0.5761,
+      "step": 2267
+    },
+    {
+      "epoch": 2.4912810654949884,
+      "grad_norm": 0.7480237483978271,
+      "learning_rate": 5.641025641025641e-05,
+      "loss": 0.4026,
+      "step": 2268
+    },
+    {
+      "epoch": 2.4923795139365645,
+      "grad_norm": 0.5738864541053772,
+      "learning_rate": 5.6288156288156286e-05,
+      "loss": 0.8657,
+      "step": 2269
+    },
+    {
+      "epoch": 2.493477962378141,
+      "grad_norm": 0.7320820093154907,
+      "learning_rate": 5.616605616605616e-05,
+      "loss": 0.7341,
+      "step": 2270
+    },
+    {
+      "epoch": 2.4945764108197173,
+      "grad_norm": 0.7029497623443604,
+      "learning_rate": 5.6043956043956037e-05,
+      "loss": 0.7597,
+      "step": 2271
+    },
+    {
+      "epoch": 2.4956748592612934,
+      "grad_norm": 0.5160001516342163,
+      "learning_rate": 5.592185592185592e-05,
+      "loss": 0.6488,
+      "step": 2272
+    },
+    {
+      "epoch": 2.4967733077028695,
+      "grad_norm": 0.5425933003425598,
+      "learning_rate": 5.5799755799755794e-05,
+      "loss": 0.7102,
+      "step": 2273
+    },
+    {
+      "epoch": 2.497871756144446,
+      "grad_norm": 0.5881295204162598,
+      "learning_rate": 5.567765567765567e-05,
+      "loss": 0.8123,
+      "step": 2274
+    },
+    {
+      "epoch": 2.4989702045860223,
+      "grad_norm": 0.6021397113800049,
+      "learning_rate": 5.5555555555555545e-05,
+      "loss": 0.8887,
+      "step": 2275
+    },
+    {
+      "epoch": 2.5000686530275984,
+      "grad_norm": 0.4754411578178406,
+      "learning_rate": 5.543345543345543e-05,
+      "loss": 0.8162,
+      "step": 2276
+    },
+    {
+      "epoch": 2.501167101469175,
+      "grad_norm": 0.46976983547210693,
+      "learning_rate": 5.531135531135531e-05,
+      "loss": 0.4177,
+      "step": 2277
+    },
+    {
+      "epoch": 2.502265549910751,
+      "grad_norm": 0.4946482181549072,
+      "learning_rate": 5.518925518925518e-05,
+      "loss": 0.6997,
+      "step": 2278
+    },
+    {
+      "epoch": 2.5033639983523273,
+      "grad_norm": 0.49166280031204224,
+      "learning_rate": 5.5067155067155066e-05,
+      "loss": 0.6436,
+      "step": 2279
+    },
+    {
+      "epoch": 2.5044624467939034,
+      "grad_norm": 0.40157628059387207,
+      "learning_rate": 5.494505494505494e-05,
+      "loss": 0.6998,
+      "step": 2280
+    },
+    {
+      "epoch": 2.50556089523548,
+      "grad_norm": 0.4139937162399292,
+      "learning_rate": 5.482295482295482e-05,
+      "loss": 0.4021,
+      "step": 2281
+    },
+    {
+      "epoch": 2.506659343677056,
+      "grad_norm": 3.6814892292022705,
+      "learning_rate": 5.4700854700854696e-05,
+      "loss": 0.6402,
+      "step": 2282
+    },
+    {
+      "epoch": 2.5077577921186327,
+      "grad_norm": 0.3136257529258728,
+      "learning_rate": 5.4578754578754574e-05,
+      "loss": 0.5364,
+      "step": 2283
+    },
+    {
+      "epoch": 2.508856240560209,
+      "grad_norm": 0.42901432514190674,
+      "learning_rate": 5.445665445665445e-05,
+      "loss": 0.6838,
+      "step": 2284
+    },
+    {
+      "epoch": 2.509954689001785,
+      "grad_norm": 0.8462406992912292,
+      "learning_rate": 5.433455433455433e-05,
+      "loss": 0.4232,
+      "step": 2285
+    },
+    {
+      "epoch": 2.511053137443361,
+      "grad_norm": 1.244150996208191,
+      "learning_rate": 5.4212454212454204e-05,
+      "loss": 0.6192,
+      "step": 2286
+    },
+    {
+      "epoch": 2.5121515858849373,
+      "grad_norm": 0.834296703338623,
+      "learning_rate": 5.409035409035409e-05,
+      "loss": 0.548,
+      "step": 2287
+    },
+    {
+      "epoch": 2.513250034326514,
+      "grad_norm": 0.4279276430606842,
+      "learning_rate": 5.396825396825396e-05,
+      "loss": 0.7549,
+      "step": 2288
+    },
+    {
+      "epoch": 2.51434848276809,
+      "grad_norm": 0.5770757794380188,
+      "learning_rate": 5.384615384615384e-05,
+      "loss": 0.6156,
+      "step": 2289
+    },
+    {
+      "epoch": 2.5154469312096666,
+      "grad_norm": 0.41763821244239807,
+      "learning_rate": 5.3724053724053725e-05,
+      "loss": 0.5019,
+      "step": 2290
+    },
+    {
+      "epoch": 2.5165453796512427,
+      "grad_norm": 0.5212944746017456,
+      "learning_rate": 5.36019536019536e-05,
+      "loss": 0.6132,
+      "step": 2291
+    },
+    {
+      "epoch": 2.517643828092819,
+      "grad_norm": 0.44493457674980164,
+      "learning_rate": 5.3479853479853476e-05,
+      "loss": 0.4162,
+      "step": 2292
+    },
+    {
+      "epoch": 2.518742276534395,
+      "grad_norm": 0.46922022104263306,
+      "learning_rate": 5.335775335775335e-05,
+      "loss": 0.4624,
+      "step": 2293
+    },
+    {
+      "epoch": 2.5198407249759716,
+      "grad_norm": 0.41906213760375977,
+      "learning_rate": 5.3235653235653233e-05,
+      "loss": 0.612,
+      "step": 2294
+    },
+    {
+      "epoch": 2.5209391734175477,
+      "grad_norm": 0.620276153087616,
+      "learning_rate": 5.311355311355311e-05,
+      "loss": 0.6322,
+      "step": 2295
+    },
+    {
+      "epoch": 2.522037621859124,
+      "grad_norm": 0.6597051620483398,
+      "learning_rate": 5.2991452991452984e-05,
+      "loss": 0.7659,
+      "step": 2296
+    },
+    {
+      "epoch": 2.5231360703007004,
+      "grad_norm": 4.377660274505615,
+      "learning_rate": 5.286935286935286e-05,
+      "loss": 0.8294,
+      "step": 2297
+    },
+    {
+      "epoch": 2.5242345187422766,
+      "grad_norm": 0.6086331009864807,
+      "learning_rate": 5.274725274725275e-05,
+      "loss": 0.5164,
+      "step": 2298
+    },
+    {
+      "epoch": 2.5253329671838527,
+      "grad_norm": 0.5100352168083191,
+      "learning_rate": 5.262515262515262e-05,
+      "loss": 0.6319,
+      "step": 2299
+    },
+    {
+      "epoch": 2.526431415625429,
+      "grad_norm": 0.6642487049102783,
+      "learning_rate": 5.25030525030525e-05,
+      "loss": 0.533,
+      "step": 2300
+    },
+    {
+      "epoch": 2.5275298640670054,
+      "grad_norm": 0.5834927558898926,
+      "learning_rate": 5.238095238095237e-05,
+      "loss": 0.5669,
+      "step": 2301
+    },
+    {
+      "epoch": 2.5286283125085816,
+      "grad_norm": 0.530815064907074,
+      "learning_rate": 5.2258852258852256e-05,
+      "loss": 0.6189,
+      "step": 2302
+    },
+    {
+      "epoch": 2.529726760950158,
+      "grad_norm": 0.6275864243507385,
+      "learning_rate": 5.2136752136752135e-05,
+      "loss": 0.8403,
+      "step": 2303
+    },
+    {
+      "epoch": 2.5308252093917343,
+      "grad_norm": 0.5878366827964783,
+      "learning_rate": 5.201465201465201e-05,
+      "loss": 0.6176,
+      "step": 2304
+    },
+    {
+      "epoch": 2.5319236578333104,
+      "grad_norm": 0.37410980463027954,
+      "learning_rate": 5.189255189255189e-05,
+      "loss": 0.6337,
+      "step": 2305
+    },
+    {
+      "epoch": 2.5330221062748866,
+      "grad_norm": 0.43912917375564575,
+      "learning_rate": 5.1770451770451764e-05,
+      "loss": 0.5348,
+      "step": 2306
+    },
+    {
+      "epoch": 2.534120554716463,
+      "grad_norm": 1.4737471342086792,
+      "learning_rate": 5.164835164835164e-05,
+      "loss": 0.4862,
+      "step": 2307
+    },
+    {
+      "epoch": 2.5352190031580393,
+      "grad_norm": 0.3978705108165741,
+      "learning_rate": 5.152625152625152e-05,
+      "loss": 0.7929,
+      "step": 2308
+    },
+    {
+      "epoch": 2.5363174515996154,
+      "grad_norm": 0.3852058947086334,
+      "learning_rate": 5.14041514041514e-05,
+      "loss": 0.5895,
+      "step": 2309
+    },
+    {
+      "epoch": 2.537415900041192,
+      "grad_norm": 17.968448638916016,
+      "learning_rate": 5.128205128205128e-05,
+      "loss": 0.4661,
+      "step": 2310
+    },
+    {
+      "epoch": 2.538514348482768,
+      "grad_norm": 0.9369175434112549,
+      "learning_rate": 5.115995115995115e-05,
+      "loss": 0.5957,
+      "step": 2311
+    },
+    {
+      "epoch": 2.5396127969243443,
+      "grad_norm": 0.612750768661499,
+      "learning_rate": 5.103785103785103e-05,
+      "loss": 0.6786,
+      "step": 2312
+    },
+    {
+      "epoch": 2.5407112453659204,
+      "grad_norm": 0.588512659072876,
+      "learning_rate": 5.0915750915750915e-05,
+      "loss": 1.0482,
+      "step": 2313
+    },
+    {
+      "epoch": 2.541809693807497,
+      "grad_norm": 0.4964143633842468,
+      "learning_rate": 5.079365079365079e-05,
+      "loss": 0.5673,
+      "step": 2314
+    },
+    {
+      "epoch": 2.542908142249073,
+      "grad_norm": 0.5807982683181763,
+      "learning_rate": 5.0671550671550666e-05,
+      "loss": 0.5493,
+      "step": 2315
+    },
+    {
+      "epoch": 2.5440065906906497,
+      "grad_norm": 0.5131386518478394,
+      "learning_rate": 5.054945054945055e-05,
+      "loss": 0.5947,
+      "step": 2316
+    },
+    {
+      "epoch": 2.545105039132226,
+      "grad_norm": 0.4521124064922333,
+      "learning_rate": 5.0427350427350424e-05,
+      "loss": 0.5554,
+      "step": 2317
+    },
+    {
+      "epoch": 2.546203487573802,
+      "grad_norm": 0.9441378712654114,
+      "learning_rate": 5.03052503052503e-05,
+      "loss": 0.6991,
+      "step": 2318
+    },
+    {
+      "epoch": 2.547301936015378,
+      "grad_norm": 0.6353013515472412,
+      "learning_rate": 5.0183150183150174e-05,
+      "loss": 0.5308,
+      "step": 2319
+    },
+    {
+      "epoch": 2.5484003844569547,
+      "grad_norm": 0.5940631628036499,
+      "learning_rate": 5.006105006105006e-05,
+      "loss": 0.6536,
+      "step": 2320
+    },
+    {
+      "epoch": 2.549498832898531,
+      "grad_norm": 0.5457591414451599,
+      "learning_rate": 4.993894993894994e-05,
+      "loss": 0.6927,
+      "step": 2321
+    },
+    {
+      "epoch": 2.550597281340107,
+      "grad_norm": 0.6265937685966492,
+      "learning_rate": 4.981684981684981e-05,
+      "loss": 0.6341,
+      "step": 2322
+    },
+    {
+      "epoch": 2.5516957297816836,
+      "grad_norm": 0.5842925310134888,
+      "learning_rate": 4.969474969474969e-05,
+      "loss": 0.4583,
+      "step": 2323
+    },
+    {
+      "epoch": 2.5527941782232597,
+      "grad_norm": 0.5363351106643677,
+      "learning_rate": 4.957264957264956e-05,
+      "loss": 0.6882,
+      "step": 2324
+    },
+    {
+      "epoch": 2.553892626664836,
+      "grad_norm": 0.3677682876586914,
+      "learning_rate": 4.9450549450549446e-05,
+      "loss": 0.5671,
+      "step": 2325
+    },
+    {
+      "epoch": 2.554991075106412,
+      "grad_norm": 1.222985863685608,
+      "learning_rate": 4.9328449328449325e-05,
+      "loss": 0.4936,
+      "step": 2326
+    },
+    {
+      "epoch": 2.5560895235479886,
+      "grad_norm": 1.187898874282837,
+      "learning_rate": 4.92063492063492e-05,
+      "loss": 0.4893,
+      "step": 2327
+    },
+    {
+      "epoch": 2.5571879719895647,
+      "grad_norm": 0.38843801617622375,
+      "learning_rate": 4.908424908424908e-05,
+      "loss": 0.6512,
+      "step": 2328
+    },
+    {
+      "epoch": 2.558286420431141,
+      "grad_norm": 0.9550036191940308,
+      "learning_rate": 4.896214896214896e-05,
+      "loss": 0.6055,
+      "step": 2329
+    },
+    {
+      "epoch": 2.5593848688727174,
+      "grad_norm": 0.80762779712677,
+      "learning_rate": 4.884004884004883e-05,
+      "loss": 0.8852,
+      "step": 2330
+    },
+    {
+      "epoch": 2.5604833173142936,
+      "grad_norm": 0.7496643662452698,
+      "learning_rate": 4.871794871794872e-05,
+      "loss": 0.6535,
+      "step": 2331
+    },
+    {
+      "epoch": 2.5615817657558697,
+      "grad_norm": 0.5532578825950623,
+      "learning_rate": 4.859584859584859e-05,
+      "loss": 0.6336,
+      "step": 2332
+    },
+    {
+      "epoch": 2.562680214197446,
+      "grad_norm": 0.4058012366294861,
+      "learning_rate": 4.847374847374847e-05,
+      "loss": 0.6529,
+      "step": 2333
+    },
+    {
+      "epoch": 2.5637786626390224,
+      "grad_norm": 3.1913115978240967,
+      "learning_rate": 4.835164835164835e-05,
+      "loss": 0.548,
+      "step": 2334
+    },
+    {
+      "epoch": 2.5648771110805986,
+      "grad_norm": 0.47375988960266113,
+      "learning_rate": 4.822954822954822e-05,
+      "loss": 0.7567,
+      "step": 2335
+    },
+    {
+      "epoch": 2.565975559522175,
+      "grad_norm": 0.5287726521492004,
+      "learning_rate": 4.8107448107448106e-05,
+      "loss": 0.6009,
+      "step": 2336
+    },
+    {
+      "epoch": 2.5670740079637513,
+      "grad_norm": 0.43966931104660034,
+      "learning_rate": 4.798534798534798e-05,
+      "loss": 0.5538,
+      "step": 2337
+    },
+    {
+      "epoch": 2.5681724564053274,
+      "grad_norm": 0.6683239340782166,
+      "learning_rate": 4.7863247863247856e-05,
+      "loss": 0.3999,
+      "step": 2338
+    },
+    {
+      "epoch": 2.5692709048469036,
+      "grad_norm": 0.5260687470436096,
+      "learning_rate": 4.774114774114774e-05,
+      "loss": 0.7212,
+      "step": 2339
+    },
+    {
+      "epoch": 2.57036935328848,
+      "grad_norm": 1.086850881576538,
+      "learning_rate": 4.7619047619047614e-05,
+      "loss": 0.7439,
+      "step": 2340
+    },
+    {
+      "epoch": 2.5714678017300563,
+      "grad_norm": 0.9744517207145691,
+      "learning_rate": 4.749694749694749e-05,
+      "loss": 0.5625,
+      "step": 2341
+    },
+    {
+      "epoch": 2.5725662501716324,
+      "grad_norm": 0.6829352974891663,
+      "learning_rate": 4.737484737484738e-05,
+      "loss": 0.5241,
+      "step": 2342
+    },
+    {
+      "epoch": 2.573664698613209,
+      "grad_norm": 0.9441612958908081,
+      "learning_rate": 4.725274725274725e-05,
+      "loss": 0.8815,
+      "step": 2343
+    },
+    {
+      "epoch": 2.574763147054785,
+      "grad_norm": 0.9406607151031494,
+      "learning_rate": 4.713064713064713e-05,
+      "loss": 0.7176,
+      "step": 2344
+    },
+    {
+      "epoch": 2.5758615954963613,
+      "grad_norm": 0.6601364016532898,
+      "learning_rate": 4.7008547008547e-05,
+      "loss": 0.7713,
+      "step": 2345
+    },
+    {
+      "epoch": 2.5769600439379374,
+      "grad_norm": 2.5189599990844727,
+      "learning_rate": 4.688644688644688e-05,
+      "loss": 0.5572,
+      "step": 2346
+    },
+    {
+      "epoch": 2.578058492379514,
+      "grad_norm": 0.7295210957527161,
+      "learning_rate": 4.6764346764346765e-05,
+      "loss": 0.4431,
+      "step": 2347
+    },
+    {
+      "epoch": 2.57915694082109,
+      "grad_norm": 0.5053385496139526,
+      "learning_rate": 4.6642246642246637e-05,
+      "loss": 0.4881,
+      "step": 2348
+    },
+    {
+      "epoch": 2.5802553892626667,
+      "grad_norm": 0.6556063890457153,
+      "learning_rate": 4.6520146520146515e-05,
+      "loss": 0.5168,
+      "step": 2349
+    },
+    {
+      "epoch": 2.581353837704243,
+      "grad_norm": 0.37052014470100403,
+      "learning_rate": 4.639804639804639e-05,
+      "loss": 0.3954,
+      "step": 2350
+    },
+    {
+      "epoch": 2.582452286145819,
+      "grad_norm": 0.5975561738014221,
+      "learning_rate": 4.627594627594627e-05,
+      "loss": 0.5714,
+      "step": 2351
+    },
+    {
+      "epoch": 2.583550734587395,
+      "grad_norm": 0.7273014187812805,
+      "learning_rate": 4.615384615384615e-05,
+      "loss": 0.7287,
+      "step": 2352
+    },
+    {
+      "epoch": 2.5846491830289717,
+      "grad_norm": 0.566586971282959,
+      "learning_rate": 4.603174603174602e-05,
+      "loss": 0.5589,
+      "step": 2353
+    },
+    {
+      "epoch": 2.585747631470548,
+      "grad_norm": 0.5846517086029053,
+      "learning_rate": 4.590964590964591e-05,
+      "loss": 0.5061,
+      "step": 2354
+    },
+    {
+      "epoch": 2.586846079912124,
+      "grad_norm": 0.7470859885215759,
+      "learning_rate": 4.578754578754579e-05,
+      "loss": 0.5433,
+      "step": 2355
+    },
+    {
+      "epoch": 2.5879445283537006,
+      "grad_norm": 0.5419175624847412,
+      "learning_rate": 4.566544566544566e-05,
+      "loss": 0.5502,
+      "step": 2356
+    },
+    {
+      "epoch": 2.5890429767952767,
+      "grad_norm": 1.507851004600525,
+      "learning_rate": 4.554334554334554e-05,
+      "loss": 0.7399,
+      "step": 2357
+    },
+    {
+      "epoch": 2.590141425236853,
+      "grad_norm": 1.4420006275177002,
+      "learning_rate": 4.542124542124542e-05,
+      "loss": 0.4233,
+      "step": 2358
+    },
+    {
+      "epoch": 2.591239873678429,
+      "grad_norm": 0.6471789479255676,
+      "learning_rate": 4.5299145299145296e-05,
+      "loss": 0.4052,
+      "step": 2359
+    },
+    {
+      "epoch": 2.5923383221200056,
+      "grad_norm": 0.5886567831039429,
+      "learning_rate": 4.5177045177045174e-05,
+      "loss": 0.7197,
+      "step": 2360
+    },
+    {
+      "epoch": 2.5934367705615817,
+      "grad_norm": 0.843024492263794,
+      "learning_rate": 4.5054945054945046e-05,
+      "loss": 0.7636,
+      "step": 2361
+    },
+    {
+      "epoch": 2.5945352190031583,
+      "grad_norm": 0.8689064979553223,
+      "learning_rate": 4.493284493284493e-05,
+      "loss": 0.6694,
+      "step": 2362
+    },
+    {
+      "epoch": 2.5956336674447344,
+      "grad_norm": 0.5112485289573669,
+      "learning_rate": 4.4810744810744804e-05,
+      "loss": 0.5338,
+      "step": 2363
+    },
+    {
+      "epoch": 2.5967321158863106,
+      "grad_norm": 0.4828614294528961,
+      "learning_rate": 4.468864468864468e-05,
+      "loss": 0.8519,
+      "step": 2364
+    },
+    {
+      "epoch": 2.5978305643278867,
+      "grad_norm": 0.5644575357437134,
+      "learning_rate": 4.456654456654457e-05,
+      "loss": 0.5605,
+      "step": 2365
+    },
+    {
+      "epoch": 2.598929012769463,
+      "grad_norm": 0.7749584913253784,
+      "learning_rate": 4.444444444444444e-05,
+      "loss": 0.6697,
+      "step": 2366
+    },
+    {
+      "epoch": 2.6000274612110394,
+      "grad_norm": 0.9038271307945251,
+      "learning_rate": 4.432234432234432e-05,
+      "loss": 0.7242,
+      "step": 2367
+    },
+    {
+      "epoch": 2.6011259096526156,
+      "grad_norm": 0.5102944374084473,
+      "learning_rate": 4.42002442002442e-05,
+      "loss": 0.5841,
+      "step": 2368
+    },
+    {
+      "epoch": 2.602224358094192,
+      "grad_norm": 0.5072823762893677,
+      "learning_rate": 4.4078144078144076e-05,
+      "loss": 0.4927,
+      "step": 2369
+    },
+    {
+      "epoch": 2.6033228065357683,
+      "grad_norm": 0.3654184341430664,
+      "learning_rate": 4.3956043956043955e-05,
+      "loss": 0.6449,
+      "step": 2370
+    },
+    {
+      "epoch": 2.6044212549773444,
+      "grad_norm": 1.7309939861297607,
+      "learning_rate": 4.3833943833943827e-05,
+      "loss": 0.6979,
+      "step": 2371
+    },
+    {
+      "epoch": 2.6055197034189206,
+      "grad_norm": 0.7982075214385986,
+      "learning_rate": 4.3711843711843705e-05,
+      "loss": 0.6589,
+      "step": 2372
+    },
+    {
+      "epoch": 2.606618151860497,
+      "grad_norm": 0.6989462375640869,
+      "learning_rate": 4.358974358974359e-05,
+      "loss": 0.7104,
+      "step": 2373
+    },
+    {
+      "epoch": 2.6077166003020733,
+      "grad_norm": 0.7331676483154297,
+      "learning_rate": 4.346764346764346e-05,
+      "loss": 0.7565,
+      "step": 2374
+    },
+    {
+      "epoch": 2.6088150487436494,
+      "grad_norm": 1.0566400289535522,
+      "learning_rate": 4.334554334554334e-05,
+      "loss": 0.6967,
+      "step": 2375
+    },
+    {
+      "epoch": 2.609913497185226,
+      "grad_norm": 0.5988017320632935,
+      "learning_rate": 4.322344322344321e-05,
+      "loss": 0.7871,
+      "step": 2376
+    },
+    {
+      "epoch": 2.611011945626802,
+      "grad_norm": 0.4248102307319641,
+      "learning_rate": 4.31013431013431e-05,
+      "loss": 0.6891,
+      "step": 2377
+    },
+    {
+      "epoch": 2.6121103940683783,
+      "grad_norm": 1.9839611053466797,
+      "learning_rate": 4.297924297924298e-05,
+      "loss": 0.6647,
+      "step": 2378
+    },
+    {
+      "epoch": 2.6132088425099544,
+      "grad_norm": 0.4382665455341339,
+      "learning_rate": 4.285714285714285e-05,
+      "loss": 0.5969,
+      "step": 2379
+    },
+    {
+      "epoch": 2.614307290951531,
+      "grad_norm": 1.1918715238571167,
+      "learning_rate": 4.2735042735042735e-05,
+      "loss": 0.7788,
+      "step": 2380
+    },
+    {
+      "epoch": 2.615405739393107,
+      "grad_norm": 0.38117820024490356,
+      "learning_rate": 4.2612942612942614e-05,
+      "loss": 0.4967,
+      "step": 2381
+    },
+    {
+      "epoch": 2.6165041878346837,
+      "grad_norm": 0.6454489827156067,
+      "learning_rate": 4.2490842490842486e-05,
+      "loss": 0.7724,
+      "step": 2382
+    },
+    {
+      "epoch": 2.61760263627626,
+      "grad_norm": 1.0696319341659546,
+      "learning_rate": 4.2368742368742364e-05,
+      "loss": 0.5292,
+      "step": 2383
+    },
+    {
+      "epoch": 2.618701084717836,
+      "grad_norm": 0.5887579321861267,
+      "learning_rate": 4.224664224664224e-05,
+      "loss": 0.5317,
+      "step": 2384
+    },
+    {
+      "epoch": 2.619799533159412,
+      "grad_norm": 0.557188093662262,
+      "learning_rate": 4.212454212454212e-05,
+      "loss": 0.7172,
+      "step": 2385
+    },
+    {
+      "epoch": 2.6208979816009887,
+      "grad_norm": 0.5122195482254028,
+      "learning_rate": 4.2002442002442e-05,
+      "loss": 0.6398,
+      "step": 2386
+    },
+    {
+      "epoch": 2.621996430042565,
+      "grad_norm": 0.520722508430481,
+      "learning_rate": 4.188034188034187e-05,
+      "loss": 0.3984,
+      "step": 2387
+    },
+    {
+      "epoch": 2.623094878484141,
+      "grad_norm": 1.2077422142028809,
+      "learning_rate": 4.175824175824176e-05,
+      "loss": 0.6686,
+      "step": 2388
+    },
+    {
+      "epoch": 2.6241933269257176,
+      "grad_norm": 1.1437829732894897,
+      "learning_rate": 4.163614163614163e-05,
+      "loss": 0.6653,
+      "step": 2389
+    },
+    {
+      "epoch": 2.6252917753672937,
+      "grad_norm": 0.6157158017158508,
+      "learning_rate": 4.151404151404151e-05,
+      "loss": 0.7074,
+      "step": 2390
+    },
+    {
+      "epoch": 2.62639022380887,
+      "grad_norm": 1.8944931030273438,
+      "learning_rate": 4.1391941391941394e-05,
+      "loss": 0.5991,
+      "step": 2391
+    },
+    {
+      "epoch": 2.627488672250446,
+      "grad_norm": 0.6598528623580933,
+      "learning_rate": 4.1269841269841266e-05,
+      "loss": 0.6051,
+      "step": 2392
+    },
+    {
+      "epoch": 2.6285871206920226,
+      "grad_norm": 0.9341129660606384,
+      "learning_rate": 4.1147741147741145e-05,
+      "loss": 0.3795,
+      "step": 2393
+    },
+    {
+      "epoch": 2.6296855691335987,
+      "grad_norm": 0.4246079921722412,
+      "learning_rate": 4.1025641025641023e-05,
+      "loss": 0.4603,
+      "step": 2394
+    },
+    {
+      "epoch": 2.6307840175751753,
+      "grad_norm": 0.6639881134033203,
+      "learning_rate": 4.09035409035409e-05,
+      "loss": 0.5862,
+      "step": 2395
+    },
+    {
+      "epoch": 2.6318824660167515,
+      "grad_norm": 1.297917366027832,
+      "learning_rate": 4.078144078144078e-05,
+      "loss": 0.6175,
+      "step": 2396
+    },
+    {
+      "epoch": 2.6329809144583276,
+      "grad_norm": 0.7880698442459106,
+      "learning_rate": 4.065934065934065e-05,
+      "loss": 0.7034,
+      "step": 2397
+    },
+    {
+      "epoch": 2.6340793628999037,
+      "grad_norm": 0.6197066903114319,
+      "learning_rate": 4.053724053724053e-05,
+      "loss": 0.659,
+      "step": 2398
+    },
+    {
+      "epoch": 2.6351778113414803,
+      "grad_norm": 0.7560408711433411,
+      "learning_rate": 4.041514041514042e-05,
+      "loss": 0.5543,
+      "step": 2399
+    },
+    {
+      "epoch": 2.6362762597830565,
+      "grad_norm": 2.2571635246276855,
+      "learning_rate": 4.029304029304029e-05,
+      "loss": 0.712,
+      "step": 2400
+    },
+    {
+      "epoch": 2.6373747082246326,
+      "grad_norm": 0.8119613528251648,
+      "learning_rate": 4.017094017094017e-05,
+      "loss": 0.6407,
+      "step": 2401
+    },
+    {
+      "epoch": 2.638473156666209,
+      "grad_norm": 3.9773592948913574,
+      "learning_rate": 4.004884004884004e-05,
+      "loss": 0.6434,
+      "step": 2402
+    },
+    {
+      "epoch": 2.6395716051077853,
+      "grad_norm": 1.2648125886917114,
+      "learning_rate": 3.9926739926739925e-05,
+      "loss": 0.689,
+      "step": 2403
+    },
+    {
+      "epoch": 2.6406700535493615,
+      "grad_norm": 0.7015364170074463,
+      "learning_rate": 3.9804639804639804e-05,
+      "loss": 0.4175,
+      "step": 2404
+    },
+    {
+      "epoch": 2.6417685019909376,
+      "grad_norm": 0.941303551197052,
+      "learning_rate": 3.9682539682539676e-05,
+      "loss": 0.4126,
+      "step": 2405
+    },
+    {
+      "epoch": 2.642866950432514,
+      "grad_norm": 0.7533726096153259,
+      "learning_rate": 3.956043956043956e-05,
+      "loss": 0.7401,
+      "step": 2406
+    },
+    {
+      "epoch": 2.6439653988740903,
+      "grad_norm": 0.5480525493621826,
+      "learning_rate": 3.943833943833943e-05,
+      "loss": 0.5567,
+      "step": 2407
+    },
+    {
+      "epoch": 2.6450638473156665,
+      "grad_norm": 0.6171422004699707,
+      "learning_rate": 3.931623931623931e-05,
+      "loss": 0.721,
+      "step": 2408
+    },
+    {
+      "epoch": 2.646162295757243,
+      "grad_norm": 0.6719728708267212,
+      "learning_rate": 3.919413919413919e-05,
+      "loss": 0.5015,
+      "step": 2409
+    },
+    {
+      "epoch": 2.647260744198819,
+      "grad_norm": 1.8106555938720703,
+      "learning_rate": 3.907203907203906e-05,
+      "loss": 0.6954,
+      "step": 2410
+    },
+    {
+      "epoch": 2.6483591926403953,
+      "grad_norm": 0.42534878849983215,
+      "learning_rate": 3.894993894993895e-05,
+      "loss": 0.5241,
+      "step": 2411
+    },
+    {
+      "epoch": 2.6494576410819715,
+      "grad_norm": 0.8733202219009399,
+      "learning_rate": 3.882783882783883e-05,
+      "loss": 0.4485,
+      "step": 2412
+    },
+    {
+      "epoch": 2.650556089523548,
+      "grad_norm": 0.9050257802009583,
+      "learning_rate": 3.87057387057387e-05,
+      "loss": 0.6202,
+      "step": 2413
+    },
+    {
+      "epoch": 2.651654537965124,
+      "grad_norm": 0.650347888469696,
+      "learning_rate": 3.8583638583638584e-05,
+      "loss": 0.621,
+      "step": 2414
+    },
+    {
+      "epoch": 2.6527529864067008,
+      "grad_norm": 6.092042446136475,
+      "learning_rate": 3.8461538461538456e-05,
+      "loss": 0.5143,
+      "step": 2415
+    },
+    {
+      "epoch": 2.653851434848277,
+      "grad_norm": 0.7801241874694824,
+      "learning_rate": 3.8339438339438335e-05,
+      "loss": 0.5424,
+      "step": 2416
+    },
+    {
+      "epoch": 2.654949883289853,
+      "grad_norm": 0.5492686629295349,
+      "learning_rate": 3.821733821733822e-05,
+      "loss": 0.642,
+      "step": 2417
+    },
+    {
+      "epoch": 2.656048331731429,
+      "grad_norm": 0.4257514774799347,
+      "learning_rate": 3.809523809523809e-05,
+      "loss": 0.8273,
+      "step": 2418
+    },
+    {
+      "epoch": 2.6571467801730058,
+      "grad_norm": 1.0180964469909668,
+      "learning_rate": 3.797313797313797e-05,
+      "loss": 0.6962,
+      "step": 2419
+    },
+    {
+      "epoch": 2.658245228614582,
+      "grad_norm": 0.3844882547855377,
+      "learning_rate": 3.785103785103784e-05,
+      "loss": 0.7315,
+      "step": 2420
+    },
+    {
+      "epoch": 2.659343677056158,
+      "grad_norm": 0.46182385087013245,
+      "learning_rate": 3.772893772893772e-05,
+      "loss": 0.3889,
+      "step": 2421
+    },
+    {
+      "epoch": 2.6604421254977346,
+      "grad_norm": 0.562627375125885,
+      "learning_rate": 3.760683760683761e-05,
+      "loss": 0.6415,
+      "step": 2422
+    },
+    {
+      "epoch": 2.6615405739393108,
+      "grad_norm": 0.3234645128250122,
+      "learning_rate": 3.7484737484737486e-05,
+      "loss": 0.4819,
+      "step": 2423
+    },
+    {
+      "epoch": 2.662639022380887,
+      "grad_norm": 0.6804205775260925,
+      "learning_rate": 3.736263736263736e-05,
+      "loss": 0.4248,
+      "step": 2424
+    },
+    {
+      "epoch": 2.663737470822463,
+      "grad_norm": 0.5543864369392395,
+      "learning_rate": 3.7240537240537236e-05,
+      "loss": 0.5259,
+      "step": 2425
+    },
+    {
+      "epoch": 2.6648359192640396,
+      "grad_norm": 0.8411497473716736,
+      "learning_rate": 3.7118437118437115e-05,
+      "loss": 0.5448,
+      "step": 2426
+    },
+    {
+      "epoch": 2.6659343677056158,
+      "grad_norm": 0.4386245608329773,
+      "learning_rate": 3.6996336996336994e-05,
+      "loss": 0.9601,
+      "step": 2427
+    },
+    {
+      "epoch": 2.6670328161471923,
+      "grad_norm": 0.773210346698761,
+      "learning_rate": 3.687423687423687e-05,
+      "loss": 0.8601,
+      "step": 2428
+    },
+    {
+      "epoch": 2.6681312645887685,
+      "grad_norm": 0.4636232852935791,
+      "learning_rate": 3.675213675213675e-05,
+      "loss": 0.6322,
+      "step": 2429
+    },
+    {
+      "epoch": 2.6692297130303446,
+      "grad_norm": 1.6318496465682983,
+      "learning_rate": 3.663003663003662e-05,
+      "loss": 0.4402,
+      "step": 2430
+    },
+    {
+      "epoch": 2.6703281614719208,
+      "grad_norm": 0.5299782156944275,
+      "learning_rate": 3.65079365079365e-05,
+      "loss": 0.5622,
+      "step": 2431
+    },
+    {
+      "epoch": 2.6714266099134973,
+      "grad_norm": 1.1223825216293335,
+      "learning_rate": 3.638583638583638e-05,
+      "loss": 0.5994,
+      "step": 2432
+    },
+    {
+      "epoch": 2.6725250583550735,
+      "grad_norm": 1.8495402336120605,
+      "learning_rate": 3.626373626373626e-05,
+      "loss": 0.669,
+      "step": 2433
+    },
+    {
+      "epoch": 2.6736235067966496,
+      "grad_norm": 0.4963383972644806,
+      "learning_rate": 3.614163614163614e-05,
+      "loss": 0.5412,
+      "step": 2434
+    },
+    {
+      "epoch": 2.674721955238226,
+      "grad_norm": 0.5644822716712952,
+      "learning_rate": 3.601953601953602e-05,
+      "loss": 0.5768,
+      "step": 2435
+    },
+    {
+      "epoch": 2.6758204036798023,
+      "grad_norm": 0.5272318720817566,
+      "learning_rate": 3.5897435897435896e-05,
+      "loss": 0.5909,
+      "step": 2436
+    },
+    {
+      "epoch": 2.6769188521213785,
+      "grad_norm": 0.29838863015174866,
+      "learning_rate": 3.5775335775335774e-05,
+      "loss": 0.5625,
+      "step": 2437
+    },
+    {
+      "epoch": 2.6780173005629546,
+      "grad_norm": 0.5375344157218933,
+      "learning_rate": 3.565323565323565e-05,
+      "loss": 0.5932,
+      "step": 2438
+    },
+    {
+      "epoch": 2.679115749004531,
+      "grad_norm": 0.7850833535194397,
+      "learning_rate": 3.5531135531135525e-05,
+      "loss": 0.6706,
+      "step": 2439
+    },
+    {
+      "epoch": 2.6802141974461073,
+      "grad_norm": 0.5286651253700256,
+      "learning_rate": 3.540903540903541e-05,
+      "loss": 0.6865,
+      "step": 2440
+    },
+    {
+      "epoch": 2.681312645887684,
+      "grad_norm": 0.9832364320755005,
+      "learning_rate": 3.528693528693528e-05,
+      "loss": 0.7941,
+      "step": 2441
+    },
+    {
+      "epoch": 2.68241109432926,
+      "grad_norm": 0.4431805908679962,
+      "learning_rate": 3.516483516483516e-05,
+      "loss": 0.4706,
+      "step": 2442
+    },
+    {
+      "epoch": 2.683509542770836,
+      "grad_norm": 1.7264482975006104,
+      "learning_rate": 3.504273504273504e-05,
+      "loss": 0.6308,
+      "step": 2443
+    },
+    {
+      "epoch": 2.6846079912124123,
+      "grad_norm": 0.6196084022521973,
+      "learning_rate": 3.492063492063492e-05,
+      "loss": 1.0233,
+      "step": 2444
+    },
+    {
+      "epoch": 2.6857064396539885,
+      "grad_norm": 0.855876088142395,
+      "learning_rate": 3.47985347985348e-05,
+      "loss": 0.5522,
+      "step": 2445
+    },
+    {
+      "epoch": 2.686804888095565,
+      "grad_norm": 0.45323798060417175,
+      "learning_rate": 3.4676434676434676e-05,
+      "loss": 0.6232,
+      "step": 2446
+    },
+    {
+      "epoch": 2.687903336537141,
+      "grad_norm": 0.577273964881897,
+      "learning_rate": 3.455433455433455e-05,
+      "loss": 0.5051,
+      "step": 2447
+    },
+    {
+      "epoch": 2.689001784978718,
+      "grad_norm": 0.4999620020389557,
+      "learning_rate": 3.4432234432234427e-05,
+      "loss": 0.4881,
+      "step": 2448
+    },
+    {
+      "epoch": 2.690100233420294,
+      "grad_norm": 0.5028046369552612,
+      "learning_rate": 3.431013431013431e-05,
+      "loss": 0.6575,
+      "step": 2449
+    },
+    {
+      "epoch": 2.69119868186187,
+      "grad_norm": 2.122028350830078,
+      "learning_rate": 3.4188034188034184e-05,
+      "loss": 0.7226,
+      "step": 2450
+    },
+    {
+      "epoch": 2.692297130303446,
+      "grad_norm": 0.4979703426361084,
+      "learning_rate": 3.406593406593406e-05,
+      "loss": 0.5768,
+      "step": 2451
+    },
+    {
+      "epoch": 2.693395578745023,
+      "grad_norm": 0.9270527958869934,
+      "learning_rate": 3.394383394383394e-05,
+      "loss": 0.6464,
+      "step": 2452
+    },
+    {
+      "epoch": 2.694494027186599,
+      "grad_norm": 1.0739809274673462,
+      "learning_rate": 3.382173382173382e-05,
+      "loss": 0.753,
+      "step": 2453
+    },
+    {
+      "epoch": 2.695592475628175,
+      "grad_norm": 0.6039335131645203,
+      "learning_rate": 3.36996336996337e-05,
+      "loss": 0.7909,
+      "step": 2454
+    },
+    {
+      "epoch": 2.6966909240697516,
+      "grad_norm": 0.49040424823760986,
+      "learning_rate": 3.357753357753358e-05,
+      "loss": 0.6112,
+      "step": 2455
+    },
+    {
+      "epoch": 2.6977893725113278,
+      "grad_norm": 0.6890440583229065,
+      "learning_rate": 3.345543345543345e-05,
+      "loss": 0.6849,
+      "step": 2456
+    },
+    {
+      "epoch": 2.698887820952904,
+      "grad_norm": 0.7819212675094604,
+      "learning_rate": 3.333333333333333e-05,
+      "loss": 0.6797,
+      "step": 2457
+    },
+    {
+      "epoch": 2.69998626939448,
+      "grad_norm": 1.0147050619125366,
+      "learning_rate": 3.321123321123321e-05,
+      "loss": 0.6867,
+      "step": 2458
+    },
+    {
+      "epoch": 2.7010847178360566,
+      "grad_norm": 1.3562036752700806,
+      "learning_rate": 3.3089133089133086e-05,
+      "loss": 0.7811,
+      "step": 2459
+    },
+    {
+      "epoch": 2.7021831662776328,
+      "grad_norm": 0.5813838839530945,
+      "learning_rate": 3.2967032967032964e-05,
+      "loss": 0.5405,
+      "step": 2460
+    },
+    {
+      "epoch": 2.7032816147192094,
+      "grad_norm": 0.6152640581130981,
+      "learning_rate": 3.284493284493284e-05,
+      "loss": 0.425,
+      "step": 2461
+    },
+    {
+      "epoch": 2.7043800631607855,
+      "grad_norm": 1.1984590291976929,
+      "learning_rate": 3.272283272283272e-05,
+      "loss": 0.592,
+      "step": 2462
+    },
+    {
+      "epoch": 2.7054785116023616,
+      "grad_norm": 0.48487693071365356,
+      "learning_rate": 3.26007326007326e-05,
+      "loss": 0.5223,
+      "step": 2463
+    },
+    {
+      "epoch": 2.7065769600439378,
+      "grad_norm": 0.47191065549850464,
+      "learning_rate": 3.247863247863247e-05,
+      "loss": 0.6479,
+      "step": 2464
+    },
+    {
+      "epoch": 2.7076754084855144,
+      "grad_norm": 1.3167297840118408,
+      "learning_rate": 3.235653235653235e-05,
+      "loss": 0.4552,
+      "step": 2465
+    },
+    {
+      "epoch": 2.7087738569270905,
+      "grad_norm": 1.3219714164733887,
+      "learning_rate": 3.2234432234432237e-05,
+      "loss": 0.5839,
+      "step": 2466
+    },
+    {
+      "epoch": 2.7098723053686666,
+      "grad_norm": 0.8047394752502441,
+      "learning_rate": 3.211233211233211e-05,
+      "loss": 0.795,
+      "step": 2467
+    },
+    {
+      "epoch": 2.710970753810243,
+      "grad_norm": 0.6053475737571716,
+      "learning_rate": 3.199023199023199e-05,
+      "loss": 0.743,
+      "step": 2468
+    },
+    {
+      "epoch": 2.7120692022518194,
+      "grad_norm": 0.4619985818862915,
+      "learning_rate": 3.1868131868131866e-05,
+      "loss": 0.642,
+      "step": 2469
+    },
+    {
+      "epoch": 2.7131676506933955,
+      "grad_norm": 0.8241426944732666,
+      "learning_rate": 3.1746031746031745e-05,
+      "loss": 0.521,
+      "step": 2470
+    },
+    {
+      "epoch": 2.7142660991349716,
+      "grad_norm": 0.4344565272331238,
+      "learning_rate": 3.162393162393162e-05,
+      "loss": 0.4615,
+      "step": 2471
+    },
+    {
+      "epoch": 2.715364547576548,
+      "grad_norm": 0.9640605449676514,
+      "learning_rate": 3.15018315018315e-05,
+      "loss": 0.4735,
+      "step": 2472
+    },
+    {
+      "epoch": 2.7164629960181244,
+      "grad_norm": 0.49423810839653015,
+      "learning_rate": 3.1379731379731374e-05,
+      "loss": 0.7547,
+      "step": 2473
+    },
+    {
+      "epoch": 2.717561444459701,
+      "grad_norm": 0.7234408855438232,
+      "learning_rate": 3.125763125763125e-05,
+      "loss": 0.464,
+      "step": 2474
+    },
+    {
+      "epoch": 2.718659892901277,
+      "grad_norm": 0.542647123336792,
+      "learning_rate": 3.113553113553113e-05,
+      "loss": 0.5563,
+      "step": 2475
+    },
+    {
+      "epoch": 2.719758341342853,
+      "grad_norm": 0.555722177028656,
+      "learning_rate": 3.101343101343101e-05,
+      "loss": 0.6899,
+      "step": 2476
+    },
+    {
+      "epoch": 2.7208567897844294,
+      "grad_norm": 0.6171600222587585,
+      "learning_rate": 3.089133089133089e-05,
+      "loss": 0.6088,
+      "step": 2477
+    },
+    {
+      "epoch": 2.7219552382260055,
+      "grad_norm": 0.9118738770484924,
+      "learning_rate": 3.076923076923077e-05,
+      "loss": 0.7778,
+      "step": 2478
+    },
+    {
+      "epoch": 2.723053686667582,
+      "grad_norm": 0.6610655784606934,
+      "learning_rate": 3.064713064713064e-05,
+      "loss": 0.6935,
+      "step": 2479
+    },
+    {
+      "epoch": 2.724152135109158,
+      "grad_norm": 0.6729289889335632,
+      "learning_rate": 3.0525030525030525e-05,
+      "loss": 0.792,
+      "step": 2480
+    },
+    {
+      "epoch": 2.725250583550735,
+      "grad_norm": 0.4955647587776184,
+      "learning_rate": 3.04029304029304e-05,
+      "loss": 0.6746,
+      "step": 2481
+    },
+    {
+      "epoch": 2.726349031992311,
+      "grad_norm": 0.42975953221321106,
+      "learning_rate": 3.028083028083028e-05,
+      "loss": 0.5318,
+      "step": 2482
+    },
+    {
+      "epoch": 2.727447480433887,
+      "grad_norm": 0.3555055856704712,
+      "learning_rate": 3.0158730158730154e-05,
+      "loss": 0.6377,
+      "step": 2483
+    },
+    {
+      "epoch": 2.728545928875463,
+      "grad_norm": 3.138209342956543,
+      "learning_rate": 3.0036630036630036e-05,
+      "loss": 0.6296,
+      "step": 2484
+    },
+    {
+      "epoch": 2.72964437731704,
+      "grad_norm": 0.5710242390632629,
+      "learning_rate": 2.9914529914529912e-05,
+      "loss": 0.8987,
+      "step": 2485
+    },
+    {
+      "epoch": 2.730742825758616,
+      "grad_norm": 0.5200769305229187,
+      "learning_rate": 2.979242979242979e-05,
+      "loss": 0.5154,
+      "step": 2486
+    },
+    {
+      "epoch": 2.731841274200192,
+      "grad_norm": 0.797572910785675,
+      "learning_rate": 2.9670329670329666e-05,
+      "loss": 0.8039,
+      "step": 2487
+    },
+    {
+      "epoch": 2.7329397226417687,
+      "grad_norm": 0.4667447805404663,
+      "learning_rate": 2.9548229548229548e-05,
+      "loss": 0.586,
+      "step": 2488
+    },
+    {
+      "epoch": 2.734038171083345,
+      "grad_norm": 0.5500869154930115,
+      "learning_rate": 2.9426129426129423e-05,
+      "loss": 0.7007,
+      "step": 2489
+    },
+    {
+      "epoch": 2.735136619524921,
+      "grad_norm": 0.5311625003814697,
+      "learning_rate": 2.9304029304029302e-05,
+      "loss": 0.4257,
+      "step": 2490
+    },
+    {
+      "epoch": 2.736235067966497,
+      "grad_norm": 0.6474941968917847,
+      "learning_rate": 2.9181929181929177e-05,
+      "loss": 0.4747,
+      "step": 2491
+    },
+    {
+      "epoch": 2.7373335164080737,
+      "grad_norm": 1.1186646223068237,
+      "learning_rate": 2.9059829059829056e-05,
+      "loss": 0.8177,
+      "step": 2492
+    },
+    {
+      "epoch": 2.73843196484965,
+      "grad_norm": 2.455371379852295,
+      "learning_rate": 2.8937728937728938e-05,
+      "loss": 0.6535,
+      "step": 2493
+    },
+    {
+      "epoch": 2.7395304132912264,
+      "grad_norm": 0.5033484101295471,
+      "learning_rate": 2.8815628815628813e-05,
+      "loss": 0.525,
+      "step": 2494
+    },
+    {
+      "epoch": 2.7406288617328025,
+      "grad_norm": 0.5826357007026672,
+      "learning_rate": 2.869352869352869e-05,
+      "loss": 0.476,
+      "step": 2495
+    },
+    {
+      "epoch": 2.7417273101743787,
+      "grad_norm": 0.5875104665756226,
+      "learning_rate": 2.8571428571428567e-05,
+      "loss": 0.6903,
+      "step": 2496
+    },
+    {
+      "epoch": 2.742825758615955,
+      "grad_norm": 0.6006028056144714,
+      "learning_rate": 2.844932844932845e-05,
+      "loss": 0.8522,
+      "step": 2497
+    },
+    {
+      "epoch": 2.7439242070575314,
+      "grad_norm": 0.5605003833770752,
+      "learning_rate": 2.8327228327228325e-05,
+      "loss": 0.5312,
+      "step": 2498
+    },
+    {
+      "epoch": 2.7450226554991075,
+      "grad_norm": 0.7641153931617737,
+      "learning_rate": 2.8205128205128204e-05,
+      "loss": 0.6841,
+      "step": 2499
+    },
+    {
+      "epoch": 2.7461211039406836,
+      "grad_norm": 0.5523414015769958,
+      "learning_rate": 2.808302808302808e-05,
+      "loss": 0.6582,
+      "step": 2500
+    },
+    {
+      "epoch": 2.7472195523822602,
+      "grad_norm": 0.40714672207832336,
+      "learning_rate": 2.796092796092796e-05,
+      "loss": 0.7493,
+      "step": 2501
+    },
+    {
+      "epoch": 2.7483180008238364,
+      "grad_norm": 0.6960926651954651,
+      "learning_rate": 2.7838827838827836e-05,
+      "loss": 0.7104,
+      "step": 2502
+    },
+    {
+      "epoch": 2.7494164492654125,
+      "grad_norm": 0.42409783601760864,
+      "learning_rate": 2.7716727716727715e-05,
+      "loss": 0.5643,
+      "step": 2503
+    },
+    {
+      "epoch": 2.7505148977069886,
+      "grad_norm": 0.5174455046653748,
+      "learning_rate": 2.759462759462759e-05,
+      "loss": 0.4545,
+      "step": 2504
+    },
+    {
+      "epoch": 2.7516133461485652,
+      "grad_norm": 0.6353528499603271,
+      "learning_rate": 2.747252747252747e-05,
+      "loss": 0.5068,
+      "step": 2505
+    },
+    {
+      "epoch": 2.7527117945901414,
+      "grad_norm": 0.46814125776290894,
+      "learning_rate": 2.7350427350427348e-05,
+      "loss": 0.7979,
+      "step": 2506
+    },
+    {
+      "epoch": 2.753810243031718,
+      "grad_norm": 0.7229417562484741,
+      "learning_rate": 2.7228327228327227e-05,
+      "loss": 0.6212,
+      "step": 2507
+    },
+    {
+      "epoch": 2.754908691473294,
+      "grad_norm": 1.2155603170394897,
+      "learning_rate": 2.7106227106227102e-05,
+      "loss": 0.8444,
+      "step": 2508
+    },
+    {
+      "epoch": 2.7560071399148702,
+      "grad_norm": 0.462703138589859,
+      "learning_rate": 2.698412698412698e-05,
+      "loss": 0.8263,
+      "step": 2509
+    },
+    {
+      "epoch": 2.7571055883564464,
+      "grad_norm": 0.9474642872810364,
+      "learning_rate": 2.6862026862026863e-05,
+      "loss": 0.7586,
+      "step": 2510
+    },
+    {
+      "epoch": 2.758204036798023,
+      "grad_norm": 4.502622127532959,
+      "learning_rate": 2.6739926739926738e-05,
+      "loss": 0.5806,
+      "step": 2511
+    },
+    {
+      "epoch": 2.759302485239599,
+      "grad_norm": 1.1251213550567627,
+      "learning_rate": 2.6617826617826617e-05,
+      "loss": 0.6333,
+      "step": 2512
+    },
+    {
+      "epoch": 2.7604009336811752,
+      "grad_norm": 0.7035579681396484,
+      "learning_rate": 2.6495726495726492e-05,
+      "loss": 0.4739,
+      "step": 2513
+    },
+    {
+      "epoch": 2.761499382122752,
+      "grad_norm": 0.5279493927955627,
+      "learning_rate": 2.6373626373626374e-05,
+      "loss": 0.597,
+      "step": 2514
+    },
+    {
+      "epoch": 2.762597830564328,
+      "grad_norm": 0.5512554049491882,
+      "learning_rate": 2.625152625152625e-05,
+      "loss": 0.6471,
+      "step": 2515
+    },
+    {
+      "epoch": 2.763696279005904,
+      "grad_norm": 0.857778012752533,
+      "learning_rate": 2.6129426129426128e-05,
+      "loss": 0.6172,
+      "step": 2516
+    },
+    {
+      "epoch": 2.7647947274474802,
+      "grad_norm": 0.5348466634750366,
+      "learning_rate": 2.6007326007326004e-05,
+      "loss": 0.8074,
+      "step": 2517
+    },
+    {
+      "epoch": 2.765893175889057,
+      "grad_norm": 0.5413629412651062,
+      "learning_rate": 2.5885225885225882e-05,
+      "loss": 0.3879,
+      "step": 2518
+    },
+    {
+      "epoch": 2.766991624330633,
+      "grad_norm": 0.569411039352417,
+      "learning_rate": 2.576312576312576e-05,
+      "loss": 0.4392,
+      "step": 2519
+    },
+    {
+      "epoch": 2.7680900727722095,
+      "grad_norm": 0.5127429962158203,
+      "learning_rate": 2.564102564102564e-05,
+      "loss": 0.6566,
+      "step": 2520
+    },
+    {
+      "epoch": 2.7691885212137857,
+      "grad_norm": 0.7328614592552185,
+      "learning_rate": 2.5518925518925515e-05,
+      "loss": 0.6801,
+      "step": 2521
+    },
+    {
+      "epoch": 2.770286969655362,
+      "grad_norm": 0.615686297416687,
+      "learning_rate": 2.5396825396825394e-05,
+      "loss": 0.6366,
+      "step": 2522
+    },
+    {
+      "epoch": 2.771385418096938,
+      "grad_norm": 0.5250161290168762,
+      "learning_rate": 2.5274725274725276e-05,
+      "loss": 0.5737,
+      "step": 2523
+    },
+    {
+      "epoch": 2.772483866538514,
+      "grad_norm": 0.6708832383155823,
+      "learning_rate": 2.515262515262515e-05,
+      "loss": 0.6681,
+      "step": 2524
+    },
+    {
+      "epoch": 2.7735823149800907,
+      "grad_norm": 0.6120278835296631,
+      "learning_rate": 2.503052503052503e-05,
+      "loss": 0.4964,
+      "step": 2525
+    },
+    {
+      "epoch": 2.774680763421667,
+      "grad_norm": 0.7024976015090942,
+      "learning_rate": 2.4908424908424905e-05,
+      "loss": 0.7984,
+      "step": 2526
+    },
+    {
+      "epoch": 2.7757792118632434,
+      "grad_norm": 7.281716823577881,
+      "learning_rate": 2.478632478632478e-05,
+      "loss": 0.7191,
+      "step": 2527
+    },
+    {
+      "epoch": 2.7768776603048195,
+      "grad_norm": 0.7347024083137512,
+      "learning_rate": 2.4664224664224663e-05,
+      "loss": 0.8684,
+      "step": 2528
+    },
+    {
+      "epoch": 2.7779761087463957,
+      "grad_norm": 1.1338274478912354,
+      "learning_rate": 2.454212454212454e-05,
+      "loss": 0.5936,
+      "step": 2529
+    },
+    {
+      "epoch": 2.779074557187972,
+      "grad_norm": 0.4176536202430725,
+      "learning_rate": 2.4420024420024417e-05,
+      "loss": 0.445,
+      "step": 2530
+    },
+    {
+      "epoch": 2.7801730056295484,
+      "grad_norm": 0.9390072822570801,
+      "learning_rate": 2.4297924297924295e-05,
+      "loss": 0.5821,
+      "step": 2531
+    },
+    {
+      "epoch": 2.7812714540711245,
+      "grad_norm": 1.1045840978622437,
+      "learning_rate": 2.4175824175824174e-05,
+      "loss": 0.7372,
+      "step": 2532
+    },
+    {
+      "epoch": 2.7823699025127007,
+      "grad_norm": 0.5568689703941345,
+      "learning_rate": 2.4053724053724053e-05,
+      "loss": 0.5005,
+      "step": 2533
+    },
+    {
+      "epoch": 2.7834683509542772,
+      "grad_norm": 0.2747582793235779,
+      "learning_rate": 2.3931623931623928e-05,
+      "loss": 0.5778,
+      "step": 2534
+    },
+    {
+      "epoch": 2.7845667993958534,
+      "grad_norm": 1.4027804136276245,
+      "learning_rate": 2.3809523809523807e-05,
+      "loss": 0.5368,
+      "step": 2535
+    },
+    {
+      "epoch": 2.7856652478374295,
+      "grad_norm": 0.7523220777511597,
+      "learning_rate": 2.368742368742369e-05,
+      "loss": 0.58,
+      "step": 2536
+    },
+    {
+      "epoch": 2.7867636962790057,
+      "grad_norm": 0.33777353167533875,
+      "learning_rate": 2.3565323565323564e-05,
+      "loss": 0.5269,
+      "step": 2537
+    },
+    {
+      "epoch": 2.7878621447205822,
+      "grad_norm": 0.5818787217140198,
+      "learning_rate": 2.344322344322344e-05,
+      "loss": 0.4459,
+      "step": 2538
+    },
+    {
+      "epoch": 2.7889605931621584,
+      "grad_norm": 0.36858034133911133,
+      "learning_rate": 2.3321123321123318e-05,
+      "loss": 0.712,
+      "step": 2539
+    },
+    {
+      "epoch": 2.790059041603735,
+      "grad_norm": 0.5299241542816162,
+      "learning_rate": 2.3199023199023194e-05,
+      "loss": 0.6086,
+      "step": 2540
+    },
+    {
+      "epoch": 2.791157490045311,
+      "grad_norm": 2.432325601577759,
+      "learning_rate": 2.3076923076923076e-05,
+      "loss": 1.0386,
+      "step": 2541
+    },
+    {
+      "epoch": 2.7922559384868872,
+      "grad_norm": 0.746638834476471,
+      "learning_rate": 2.2954822954822954e-05,
+      "loss": 0.7372,
+      "step": 2542
+    },
+    {
+      "epoch": 2.7933543869284634,
+      "grad_norm": 0.6017647981643677,
+      "learning_rate": 2.283272283272283e-05,
+      "loss": 0.9134,
+      "step": 2543
+    },
+    {
+      "epoch": 2.79445283537004,
+      "grad_norm": 0.7385385036468506,
+      "learning_rate": 2.271062271062271e-05,
+      "loss": 0.6827,
+      "step": 2544
+    },
+    {
+      "epoch": 2.795551283811616,
+      "grad_norm": 0.6607246994972229,
+      "learning_rate": 2.2588522588522587e-05,
+      "loss": 0.6333,
+      "step": 2545
+    },
+    {
+      "epoch": 2.7966497322531922,
+      "grad_norm": 0.40185117721557617,
+      "learning_rate": 2.2466422466422466e-05,
+      "loss": 0.6589,
+      "step": 2546
+    },
+    {
+      "epoch": 2.797748180694769,
+      "grad_norm": 0.48225662112236023,
+      "learning_rate": 2.234432234432234e-05,
+      "loss": 0.6571,
+      "step": 2547
+    },
+    {
+      "epoch": 2.798846629136345,
+      "grad_norm": 0.8996065855026245,
+      "learning_rate": 2.222222222222222e-05,
+      "loss": 0.7518,
+      "step": 2548
+    },
+    {
+      "epoch": 2.799945077577921,
+      "grad_norm": 0.7139112949371338,
+      "learning_rate": 2.21001221001221e-05,
+      "loss": 0.6517,
+      "step": 2549
+    },
+    {
+      "epoch": 2.8010435260194972,
+      "grad_norm": 0.5433416366577148,
+      "learning_rate": 2.1978021978021977e-05,
+      "loss": 0.3799,
+      "step": 2550
+    },
+    {
+      "epoch": 2.802141974461074,
+      "grad_norm": 0.3883088231086731,
+      "learning_rate": 2.1855921855921853e-05,
+      "loss": 0.9269,
+      "step": 2551
+    },
+    {
+      "epoch": 2.80324042290265,
+      "grad_norm": 0.5275357961654663,
+      "learning_rate": 2.173382173382173e-05,
+      "loss": 0.6606,
+      "step": 2552
+    },
+    {
+      "epoch": 2.8043388713442265,
+      "grad_norm": 0.4666341543197632,
+      "learning_rate": 2.1611721611721607e-05,
+      "loss": 0.6982,
+      "step": 2553
+    },
+    {
+      "epoch": 2.8054373197858027,
+      "grad_norm": 0.9221529364585876,
+      "learning_rate": 2.148962148962149e-05,
+      "loss": 0.4769,
+      "step": 2554
+    },
+    {
+      "epoch": 2.806535768227379,
+      "grad_norm": 0.7469640374183655,
+      "learning_rate": 2.1367521367521368e-05,
+      "loss": 0.6985,
+      "step": 2555
+    },
+    {
+      "epoch": 2.807634216668955,
+      "grad_norm": 0.6858775615692139,
+      "learning_rate": 2.1245421245421243e-05,
+      "loss": 0.4511,
+      "step": 2556
+    },
+    {
+      "epoch": 2.808732665110531,
+      "grad_norm": 1.266801357269287,
+      "learning_rate": 2.112332112332112e-05,
+      "loss": 0.421,
+      "step": 2557
+    },
+    {
+      "epoch": 2.8098311135521077,
+      "grad_norm": 0.5506262183189392,
+      "learning_rate": 2.1001221001221e-05,
+      "loss": 0.6082,
+      "step": 2558
+    },
+    {
+      "epoch": 2.810929561993684,
+      "grad_norm": 0.5359029173851013,
+      "learning_rate": 2.087912087912088e-05,
+      "loss": 0.8111,
+      "step": 2559
+    },
+    {
+      "epoch": 2.8120280104352604,
+      "grad_norm": 0.6969206929206848,
+      "learning_rate": 2.0757020757020754e-05,
+      "loss": 0.8331,
+      "step": 2560
+    },
+    {
+      "epoch": 2.8131264588768365,
+      "grad_norm": 0.6040379405021667,
+      "learning_rate": 2.0634920634920633e-05,
+      "loss": 0.575,
+      "step": 2561
+    },
+    {
+      "epoch": 2.8142249073184127,
+      "grad_norm": 1.3847273588180542,
+      "learning_rate": 2.0512820512820512e-05,
+      "loss": 0.5442,
+      "step": 2562
+    },
+    {
+      "epoch": 2.815323355759989,
+      "grad_norm": 0.8050490617752075,
+      "learning_rate": 2.039072039072039e-05,
+      "loss": 0.6267,
+      "step": 2563
+    },
+    {
+      "epoch": 2.8164218042015654,
+      "grad_norm": 0.5663136839866638,
+      "learning_rate": 2.0268620268620266e-05,
+      "loss": 0.5246,
+      "step": 2564
+    },
+    {
+      "epoch": 2.8175202526431415,
+      "grad_norm": 0.3316130042076111,
+      "learning_rate": 2.0146520146520144e-05,
+      "loss": 0.5175,
+      "step": 2565
+    },
+    {
+      "epoch": 2.8186187010847177,
+      "grad_norm": 0.4782855808734894,
+      "learning_rate": 2.002442002442002e-05,
+      "loss": 0.5111,
+      "step": 2566
+    },
+    {
+      "epoch": 2.8197171495262943,
+      "grad_norm": 0.44766396284103394,
+      "learning_rate": 1.9902319902319902e-05,
+      "loss": 0.5825,
+      "step": 2567
+    },
+    {
+      "epoch": 2.8208155979678704,
+      "grad_norm": 0.6830618977546692,
+      "learning_rate": 1.978021978021978e-05,
+      "loss": 0.5685,
+      "step": 2568
+    },
+    {
+      "epoch": 2.8219140464094465,
+      "grad_norm": 0.5860748887062073,
+      "learning_rate": 1.9658119658119656e-05,
+      "loss": 0.7557,
+      "step": 2569
+    },
+    {
+      "epoch": 2.8230124948510227,
+      "grad_norm": 0.49533459544181824,
+      "learning_rate": 1.953601953601953e-05,
+      "loss": 0.7326,
+      "step": 2570
+    },
+    {
+      "epoch": 2.8241109432925993,
+      "grad_norm": 0.4989941418170929,
+      "learning_rate": 1.9413919413919413e-05,
+      "loss": 0.5757,
+      "step": 2571
+    },
+    {
+      "epoch": 2.8252093917341754,
+      "grad_norm": 0.4973461627960205,
+      "learning_rate": 1.9291819291819292e-05,
+      "loss": 0.5357,
+      "step": 2572
+    },
+    {
+      "epoch": 2.826307840175752,
+      "grad_norm": 0.7442370057106018,
+      "learning_rate": 1.9169719169719167e-05,
+      "loss": 0.7283,
+      "step": 2573
+    },
+    {
+      "epoch": 2.827406288617328,
+      "grad_norm": 1.3321865797042847,
+      "learning_rate": 1.9047619047619046e-05,
+      "loss": 0.5107,
+      "step": 2574
+    },
+    {
+      "epoch": 2.8285047370589043,
+      "grad_norm": 0.47394871711730957,
+      "learning_rate": 1.892551892551892e-05,
+      "loss": 0.5495,
+      "step": 2575
+    },
+    {
+      "epoch": 2.8296031855004804,
+      "grad_norm": 0.6102151274681091,
+      "learning_rate": 1.8803418803418804e-05,
+      "loss": 0.5983,
+      "step": 2576
+    },
+    {
+      "epoch": 2.830701633942057,
+      "grad_norm": 0.4657471179962158,
+      "learning_rate": 1.868131868131868e-05,
+      "loss": 0.5937,
+      "step": 2577
+    },
+    {
+      "epoch": 2.831800082383633,
+      "grad_norm": 0.41180238127708435,
+      "learning_rate": 1.8559218559218558e-05,
+      "loss": 0.7775,
+      "step": 2578
+    },
+    {
+      "epoch": 2.8328985308252093,
+      "grad_norm": 3.5043845176696777,
+      "learning_rate": 1.8437118437118436e-05,
+      "loss": 0.5304,
+      "step": 2579
+    },
+    {
+      "epoch": 2.833996979266786,
+      "grad_norm": 0.4502231776714325,
+      "learning_rate": 1.831501831501831e-05,
+      "loss": 0.6556,
+      "step": 2580
+    },
+    {
+      "epoch": 2.835095427708362,
+      "grad_norm": 0.6165898442268372,
+      "learning_rate": 1.819291819291819e-05,
+      "loss": 0.8434,
+      "step": 2581
+    },
+    {
+      "epoch": 2.836193876149938,
+      "grad_norm": 0.5112649202346802,
+      "learning_rate": 1.807081807081807e-05,
+      "loss": 0.7429,
+      "step": 2582
+    },
+    {
+      "epoch": 2.8372923245915143,
+      "grad_norm": 0.4834790527820587,
+      "learning_rate": 1.7948717948717948e-05,
+      "loss": 0.5772,
+      "step": 2583
+    },
+    {
+      "epoch": 2.838390773033091,
+      "grad_norm": 0.4251219630241394,
+      "learning_rate": 1.7826617826617826e-05,
+      "loss": 0.5192,
+      "step": 2584
+    },
+    {
+      "epoch": 2.839489221474667,
+      "grad_norm": 0.7645363807678223,
+      "learning_rate": 1.7704517704517705e-05,
+      "loss": 0.6624,
+      "step": 2585
+    },
+    {
+      "epoch": 2.8405876699162436,
+      "grad_norm": 0.5651314854621887,
+      "learning_rate": 1.758241758241758e-05,
+      "loss": 0.5829,
+      "step": 2586
+    },
+    {
+      "epoch": 2.8416861183578197,
+      "grad_norm": 1.059164047241211,
+      "learning_rate": 1.746031746031746e-05,
+      "loss": 0.6688,
+      "step": 2587
+    },
+    {
+      "epoch": 2.842784566799396,
+      "grad_norm": 2.2424001693725586,
+      "learning_rate": 1.7338217338217338e-05,
+      "loss": 0.4515,
+      "step": 2588
+    },
+    {
+      "epoch": 2.843883015240972,
+      "grad_norm": 0.6211466789245605,
+      "learning_rate": 1.7216117216117213e-05,
+      "loss": 0.836,
+      "step": 2589
+    },
+    {
+      "epoch": 2.8449814636825486,
+      "grad_norm": 0.4224345088005066,
+      "learning_rate": 1.7094017094017092e-05,
+      "loss": 0.536,
+      "step": 2590
+    },
+    {
+      "epoch": 2.8460799121241247,
+      "grad_norm": 0.7985780239105225,
+      "learning_rate": 1.697191697191697e-05,
+      "loss": 0.7433,
+      "step": 2591
+    },
+    {
+      "epoch": 2.847178360565701,
+      "grad_norm": 1.4033039808273315,
+      "learning_rate": 1.684981684981685e-05,
+      "loss": 0.7479,
+      "step": 2592
+    },
+    {
+      "epoch": 2.8482768090072774,
+      "grad_norm": 1.1432255506515503,
+      "learning_rate": 1.6727716727716725e-05,
+      "loss": 0.652,
+      "step": 2593
+    },
+    {
+      "epoch": 2.8493752574488536,
+      "grad_norm": 0.9324535727500916,
+      "learning_rate": 1.6605616605616603e-05,
+      "loss": 0.5225,
+      "step": 2594
+    },
+    {
+      "epoch": 2.8504737058904297,
+      "grad_norm": 0.5573447942733765,
+      "learning_rate": 1.6483516483516482e-05,
+      "loss": 0.6649,
+      "step": 2595
+    },
+    {
+      "epoch": 2.851572154332006,
+      "grad_norm": 0.6875207424163818,
+      "learning_rate": 1.636141636141636e-05,
+      "loss": 0.7334,
+      "step": 2596
+    },
+    {
+      "epoch": 2.8526706027735824,
+      "grad_norm": 0.32099124789237976,
+      "learning_rate": 1.6239316239316236e-05,
+      "loss": 0.5732,
+      "step": 2597
+    },
+    {
+      "epoch": 2.8537690512151586,
+      "grad_norm": 0.4142940938472748,
+      "learning_rate": 1.6117216117216118e-05,
+      "loss": 0.6605,
+      "step": 2598
+    },
+    {
+      "epoch": 2.8548674996567347,
+      "grad_norm": 0.5377205610275269,
+      "learning_rate": 1.5995115995115994e-05,
+      "loss": 0.5556,
+      "step": 2599
+    },
+    {
+      "epoch": 2.8559659480983113,
+      "grad_norm": 0.43509960174560547,
+      "learning_rate": 1.5873015873015872e-05,
+      "loss": 0.8321,
+      "step": 2600
+    },
+    {
+      "epoch": 2.8570643965398874,
+      "grad_norm": 0.4376494586467743,
+      "learning_rate": 1.575091575091575e-05,
+      "loss": 0.6392,
+      "step": 2601
+    },
+    {
+      "epoch": 2.8581628449814636,
+      "grad_norm": 0.507837176322937,
+      "learning_rate": 1.5628815628815626e-05,
+      "loss": 0.5326,
+      "step": 2602
+    },
+    {
+      "epoch": 2.8592612934230397,
+      "grad_norm": 29.0502986907959,
+      "learning_rate": 1.5506715506715505e-05,
+      "loss": 0.5478,
+      "step": 2603
+    },
+    {
+      "epoch": 2.8603597418646163,
+      "grad_norm": 0.6940420866012573,
+      "learning_rate": 1.5384615384615384e-05,
+      "loss": 1.3063,
+      "step": 2604
+    },
+    {
+      "epoch": 2.8614581903061924,
+      "grad_norm": 0.7178813219070435,
+      "learning_rate": 1.5262515262515263e-05,
+      "loss": 0.7447,
+      "step": 2605
+    },
+    {
+      "epoch": 2.862556638747769,
+      "grad_norm": 0.6209506392478943,
+      "learning_rate": 1.514041514041514e-05,
+      "loss": 0.5496,
+      "step": 2606
+    },
+    {
+      "epoch": 2.863655087189345,
+      "grad_norm": 0.5526819825172424,
+      "learning_rate": 1.5018315018315018e-05,
+      "loss": 0.4224,
+      "step": 2607
+    },
+    {
+      "epoch": 2.8647535356309213,
+      "grad_norm": 0.5056405663490295,
+      "learning_rate": 1.4896214896214895e-05,
+      "loss": 0.6248,
+      "step": 2608
+    },
+    {
+      "epoch": 2.8658519840724974,
+      "grad_norm": 2.416952610015869,
+      "learning_rate": 1.4774114774114774e-05,
+      "loss": 0.7551,
+      "step": 2609
+    },
+    {
+      "epoch": 2.866950432514074,
+      "grad_norm": 0.52223140001297,
+      "learning_rate": 1.4652014652014651e-05,
+      "loss": 1.1146,
+      "step": 2610
+    },
+    {
+      "epoch": 2.86804888095565,
+      "grad_norm": 0.685767650604248,
+      "learning_rate": 1.4529914529914528e-05,
+      "loss": 0.715,
+      "step": 2611
+    },
+    {
+      "epoch": 2.8691473293972263,
+      "grad_norm": 0.650374174118042,
+      "learning_rate": 1.4407814407814407e-05,
+      "loss": 0.8844,
+      "step": 2612
+    },
+    {
+      "epoch": 2.870245777838803,
+      "grad_norm": 0.46946465969085693,
+      "learning_rate": 1.4285714285714284e-05,
+      "loss": 0.9545,
+      "step": 2613
+    },
+    {
+      "epoch": 2.871344226280379,
+      "grad_norm": 0.5312052369117737,
+      "learning_rate": 1.4163614163614162e-05,
+      "loss": 0.5204,
+      "step": 2614
+    },
+    {
+      "epoch": 2.872442674721955,
+      "grad_norm": 0.41921889781951904,
+      "learning_rate": 1.404151404151404e-05,
+      "loss": 0.4614,
+      "step": 2615
+    },
+    {
+      "epoch": 2.8735411231635313,
+      "grad_norm": 0.513203501701355,
+      "learning_rate": 1.3919413919413918e-05,
+      "loss": 0.613,
+      "step": 2616
+    },
+    {
+      "epoch": 2.874639571605108,
+      "grad_norm": 1.1020901203155518,
+      "learning_rate": 1.3797313797313795e-05,
+      "loss": 0.525,
+      "step": 2617
+    },
+    {
+      "epoch": 2.875738020046684,
+      "grad_norm": 0.39301392436027527,
+      "learning_rate": 1.3675213675213674e-05,
+      "loss": 0.5799,
+      "step": 2618
+    },
+    {
+      "epoch": 2.8768364684882606,
+      "grad_norm": 1.576910376548767,
+      "learning_rate": 1.3553113553113551e-05,
+      "loss": 0.6286,
+      "step": 2619
+    },
+    {
+      "epoch": 2.8779349169298367,
+      "grad_norm": 0.36711424589157104,
+      "learning_rate": 1.3431013431013431e-05,
+      "loss": 0.7542,
+      "step": 2620
+    },
+    {
+      "epoch": 2.879033365371413,
+      "grad_norm": 1.2777636051177979,
+      "learning_rate": 1.3308913308913308e-05,
+      "loss": 0.6269,
+      "step": 2621
+    },
+    {
+      "epoch": 2.880131813812989,
+      "grad_norm": 0.5584180355072021,
+      "learning_rate": 1.3186813186813187e-05,
+      "loss": 0.5633,
+      "step": 2622
+    },
+    {
+      "epoch": 2.8812302622545656,
+      "grad_norm": 1.2418673038482666,
+      "learning_rate": 1.3064713064713064e-05,
+      "loss": 0.537,
+      "step": 2623
+    },
+    {
+      "epoch": 2.8823287106961417,
+      "grad_norm": 0.5850531458854675,
+      "learning_rate": 1.2942612942612941e-05,
+      "loss": 0.595,
+      "step": 2624
+    },
+    {
+      "epoch": 2.883427159137718,
+      "grad_norm": 1.054592251777649,
+      "learning_rate": 1.282051282051282e-05,
+      "loss": 0.8308,
+      "step": 2625
+    },
+    {
+      "epoch": 2.8845256075792944,
+      "grad_norm": 0.3231412470340729,
+      "learning_rate": 1.2698412698412697e-05,
+      "loss": 0.4044,
+      "step": 2626
+    },
+    {
+      "epoch": 2.8856240560208706,
+      "grad_norm": 0.47942933440208435,
+      "learning_rate": 1.2576312576312576e-05,
+      "loss": 0.6299,
+      "step": 2627
+    },
+    {
+      "epoch": 2.8867225044624467,
+      "grad_norm": 0.4884187579154968,
+      "learning_rate": 1.2454212454212453e-05,
+      "loss": 0.6606,
+      "step": 2628
+    },
+    {
+      "epoch": 2.887820952904023,
+      "grad_norm": 0.6658734083175659,
+      "learning_rate": 1.2332112332112331e-05,
+      "loss": 0.642,
+      "step": 2629
+    },
+    {
+      "epoch": 2.8889194013455994,
+      "grad_norm": 0.24990247189998627,
+      "learning_rate": 1.2210012210012208e-05,
+      "loss": 0.4041,
+      "step": 2630
+    },
+    {
+      "epoch": 2.8900178497871756,
+      "grad_norm": 0.6446508169174194,
+      "learning_rate": 1.2087912087912087e-05,
+      "loss": 0.7126,
+      "step": 2631
+    },
+    {
+      "epoch": 2.891116298228752,
+      "grad_norm": 0.7800988554954529,
+      "learning_rate": 1.1965811965811964e-05,
+      "loss": 0.6733,
+      "step": 2632
+    },
+    {
+      "epoch": 2.8922147466703283,
+      "grad_norm": 0.5319482684135437,
+      "learning_rate": 1.1843711843711844e-05,
+      "loss": 0.6445,
+      "step": 2633
+    },
+    {
+      "epoch": 2.8933131951119044,
+      "grad_norm": 0.6029678583145142,
+      "learning_rate": 1.172161172161172e-05,
+      "loss": 0.7642,
+      "step": 2634
+    },
+    {
+      "epoch": 2.8944116435534806,
+      "grad_norm": 0.9029693007469177,
+      "learning_rate": 1.1599511599511597e-05,
+      "loss": 0.635,
+      "step": 2635
+    },
+    {
+      "epoch": 2.8955100919950567,
+      "grad_norm": 0.6022691130638123,
+      "learning_rate": 1.1477411477411477e-05,
+      "loss": 0.5361,
+      "step": 2636
+    },
+    {
+      "epoch": 2.8966085404366333,
+      "grad_norm": 0.6777801513671875,
+      "learning_rate": 1.1355311355311354e-05,
+      "loss": 0.5099,
+      "step": 2637
+    },
+    {
+      "epoch": 2.8977069888782094,
+      "grad_norm": 0.4157528877258301,
+      "learning_rate": 1.1233211233211233e-05,
+      "loss": 0.5038,
+      "step": 2638
+    },
+    {
+      "epoch": 2.898805437319786,
+      "grad_norm": 2.6101133823394775,
+      "learning_rate": 1.111111111111111e-05,
+      "loss": 0.6324,
+      "step": 2639
+    },
+    {
+      "epoch": 2.899903885761362,
+      "grad_norm": 0.6885612607002258,
+      "learning_rate": 1.0989010989010989e-05,
+      "loss": 0.4931,
+      "step": 2640
+    },
+    {
+      "epoch": 2.9010023342029383,
+      "grad_norm": 0.5510079264640808,
+      "learning_rate": 1.0866910866910866e-05,
+      "loss": 0.5088,
+      "step": 2641
+    },
+    {
+      "epoch": 2.9021007826445144,
+      "grad_norm": 0.6099854111671448,
+      "learning_rate": 1.0744810744810744e-05,
+      "loss": 0.4647,
+      "step": 2642
+    },
+    {
+      "epoch": 2.903199231086091,
+      "grad_norm": 0.4390881657600403,
+      "learning_rate": 1.0622710622710621e-05,
+      "loss": 0.6787,
+      "step": 2643
+    },
+    {
+      "epoch": 2.904297679527667,
+      "grad_norm": 0.46238628029823303,
+      "learning_rate": 1.05006105006105e-05,
+      "loss": 0.5655,
+      "step": 2644
+    },
+    {
+      "epoch": 2.9053961279692433,
+      "grad_norm": 0.479106605052948,
+      "learning_rate": 1.0378510378510377e-05,
+      "loss": 0.7833,
+      "step": 2645
+    },
+    {
+      "epoch": 2.90649457641082,
+      "grad_norm": 0.4643683135509491,
+      "learning_rate": 1.0256410256410256e-05,
+      "loss": 0.4563,
+      "step": 2646
+    },
+    {
+      "epoch": 2.907593024852396,
+      "grad_norm": 0.4173976480960846,
+      "learning_rate": 1.0134310134310133e-05,
+      "loss": 0.6614,
+      "step": 2647
+    },
+    {
+      "epoch": 2.908691473293972,
+      "grad_norm": 0.7158990502357483,
+      "learning_rate": 1.001221001221001e-05,
+      "loss": 0.7342,
+      "step": 2648
+    },
+    {
+      "epoch": 2.9097899217355483,
+      "grad_norm": 0.7276301980018616,
+      "learning_rate": 9.89010989010989e-06,
+      "loss": 0.6883,
+      "step": 2649
+    },
+    {
+      "epoch": 2.910888370177125,
+      "grad_norm": 0.63588947057724,
+      "learning_rate": 9.768009768009766e-06,
+      "loss": 0.7533,
+      "step": 2650
+    },
+    {
+      "epoch": 2.911986818618701,
+      "grad_norm": 1.8038127422332764,
+      "learning_rate": 9.645909645909646e-06,
+      "loss": 0.6238,
+      "step": 2651
+    },
+    {
+      "epoch": 2.9130852670602776,
+      "grad_norm": 0.7289617657661438,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 0.4767,
+      "step": 2652
+    },
+    {
+      "epoch": 2.9141837155018537,
+      "grad_norm": 0.3828502893447876,
+      "learning_rate": 9.401709401709402e-06,
+      "loss": 0.4812,
+      "step": 2653
+    },
+    {
+      "epoch": 2.91528216394343,
+      "grad_norm": 0.5157826542854309,
+      "learning_rate": 9.279609279609279e-06,
+      "loss": 0.703,
+      "step": 2654
+    },
+    {
+      "epoch": 2.916380612385006,
+      "grad_norm": 0.6833345890045166,
+      "learning_rate": 9.157509157509156e-06,
+      "loss": 0.7471,
+      "step": 2655
+    },
+    {
+      "epoch": 2.9174790608265826,
+      "grad_norm": 1.0189886093139648,
+      "learning_rate": 9.035409035409035e-06,
+      "loss": 0.6065,
+      "step": 2656
+    },
+    {
+      "epoch": 2.9185775092681587,
+      "grad_norm": 0.5197221040725708,
+      "learning_rate": 8.913308913308913e-06,
+      "loss": 0.5904,
+      "step": 2657
+    },
+    {
+      "epoch": 2.919675957709735,
+      "grad_norm": 0.6265780925750732,
+      "learning_rate": 8.79120879120879e-06,
+      "loss": 0.5622,
+      "step": 2658
+    },
+    {
+      "epoch": 2.9207744061513115,
+      "grad_norm": 0.5703533887863159,
+      "learning_rate": 8.669108669108669e-06,
+      "loss": 0.8005,
+      "step": 2659
+    },
+    {
+      "epoch": 2.9218728545928876,
+      "grad_norm": 0.8656613230705261,
+      "learning_rate": 8.547008547008546e-06,
+      "loss": 0.4942,
+      "step": 2660
+    },
+    {
+      "epoch": 2.9229713030344637,
+      "grad_norm": 0.6180423498153687,
+      "learning_rate": 8.424908424908425e-06,
+      "loss": 0.8163,
+      "step": 2661
+    },
+    {
+      "epoch": 2.92406975147604,
+      "grad_norm": 0.7308143377304077,
+      "learning_rate": 8.302808302808302e-06,
+      "loss": 0.7639,
+      "step": 2662
+    },
+    {
+      "epoch": 2.9251681999176165,
+      "grad_norm": 0.585617184638977,
+      "learning_rate": 8.18070818070818e-06,
+      "loss": 0.7614,
+      "step": 2663
+    },
+    {
+      "epoch": 2.9262666483591926,
+      "grad_norm": 0.5277345776557922,
+      "learning_rate": 8.058608058608059e-06,
+      "loss": 0.6489,
+      "step": 2664
+    },
+    {
+      "epoch": 2.927365096800769,
+      "grad_norm": 0.3540293574333191,
+      "learning_rate": 7.936507936507936e-06,
+      "loss": 0.4503,
+      "step": 2665
+    },
+    {
+      "epoch": 2.9284635452423453,
+      "grad_norm": 0.554492175579071,
+      "learning_rate": 7.814407814407813e-06,
+      "loss": 0.5785,
+      "step": 2666
+    },
+    {
+      "epoch": 2.9295619936839215,
+      "grad_norm": 0.5547875761985779,
+      "learning_rate": 7.692307692307692e-06,
+      "loss": 0.5763,
+      "step": 2667
+    },
+    {
+      "epoch": 2.9306604421254976,
+      "grad_norm": 0.745947003364563,
+      "learning_rate": 7.57020757020757e-06,
+      "loss": 0.512,
+      "step": 2668
+    },
+    {
+      "epoch": 2.931758890567074,
+      "grad_norm": 0.47691571712493896,
+      "learning_rate": 7.448107448107448e-06,
+      "loss": 0.7018,
+      "step": 2669
+    },
+    {
+      "epoch": 2.9328573390086503,
+      "grad_norm": 0.9611607789993286,
+      "learning_rate": 7.3260073260073255e-06,
+      "loss": 0.7419,
+      "step": 2670
+    },
+    {
+      "epoch": 2.9339557874502264,
+      "grad_norm": 0.5495268106460571,
+      "learning_rate": 7.203907203907203e-06,
+      "loss": 0.6096,
+      "step": 2671
+    },
+    {
+      "epoch": 2.935054235891803,
+      "grad_norm": 0.8863226771354675,
+      "learning_rate": 7.081807081807081e-06,
+      "loss": 0.7149,
+      "step": 2672
+    },
+    {
+      "epoch": 2.936152684333379,
+      "grad_norm": 0.4234665334224701,
+      "learning_rate": 6.959706959706959e-06,
+      "loss": 0.6913,
+      "step": 2673
+    },
+    {
+      "epoch": 2.9372511327749553,
+      "grad_norm": 0.9667326211929321,
+      "learning_rate": 6.837606837606837e-06,
+      "loss": 0.4181,
+      "step": 2674
+    },
+    {
+      "epoch": 2.9383495812165314,
+      "grad_norm": 0.543683648109436,
+      "learning_rate": 6.715506715506716e-06,
+      "loss": 0.6329,
+      "step": 2675
+    },
+    {
+      "epoch": 2.939448029658108,
+      "grad_norm": 0.5083779692649841,
+      "learning_rate": 6.5934065934065935e-06,
+      "loss": 0.8742,
+      "step": 2676
+    },
+    {
+      "epoch": 2.940546478099684,
+      "grad_norm": 0.7212001085281372,
+      "learning_rate": 6.4713064713064706e-06,
+      "loss": 0.6912,
+      "step": 2677
+    },
+    {
+      "epoch": 2.9416449265412603,
+      "grad_norm": 0.9474835991859436,
+      "learning_rate": 6.349206349206348e-06,
+      "loss": 0.649,
+      "step": 2678
+    },
+    {
+      "epoch": 2.942743374982837,
+      "grad_norm": 0.8142021298408508,
+      "learning_rate": 6.227106227106226e-06,
+      "loss": 0.6136,
+      "step": 2679
+    },
+    {
+      "epoch": 2.943841823424413,
+      "grad_norm": 2.9018187522888184,
+      "learning_rate": 6.105006105006104e-06,
+      "loss": 0.7157,
+      "step": 2680
+    },
+    {
+      "epoch": 2.944940271865989,
+      "grad_norm": 0.4023605287075043,
+      "learning_rate": 5.982905982905982e-06,
+      "loss": 0.5675,
+      "step": 2681
+    },
+    {
+      "epoch": 2.9460387203075653,
+      "grad_norm": 0.3693840801715851,
+      "learning_rate": 5.86080586080586e-06,
+      "loss": 0.5982,
+      "step": 2682
+    },
+    {
+      "epoch": 2.947137168749142,
+      "grad_norm": 0.4298234283924103,
+      "learning_rate": 5.738705738705739e-06,
+      "loss": 0.5379,
+      "step": 2683
+    },
+    {
+      "epoch": 2.948235617190718,
+      "grad_norm": 0.6495395302772522,
+      "learning_rate": 5.6166056166056165e-06,
+      "loss": 0.5411,
+      "step": 2684
+    },
+    {
+      "epoch": 2.9493340656322946,
+      "grad_norm": 0.44857510924339294,
+      "learning_rate": 5.494505494505494e-06,
+      "loss": 0.5154,
+      "step": 2685
+    },
+    {
+      "epoch": 2.9504325140738707,
+      "grad_norm": 0.7485830187797546,
+      "learning_rate": 5.372405372405372e-06,
+      "loss": 0.6595,
+      "step": 2686
+    },
+    {
+      "epoch": 2.951530962515447,
+      "grad_norm": 0.5141469836235046,
+      "learning_rate": 5.25030525030525e-06,
+      "loss": 0.6289,
+      "step": 2687
+    },
+    {
+      "epoch": 2.952629410957023,
+      "grad_norm": 0.8847435712814331,
+      "learning_rate": 5.128205128205128e-06,
+      "loss": 0.6734,
+      "step": 2688
+    },
+    {
+      "epoch": 2.9537278593985996,
+      "grad_norm": 0.570573091506958,
+      "learning_rate": 5.006105006105005e-06,
+      "loss": 0.7013,
+      "step": 2689
+    },
+    {
+      "epoch": 2.9548263078401757,
+      "grad_norm": 0.4376991391181946,
+      "learning_rate": 4.884004884004883e-06,
+      "loss": 0.5918,
+      "step": 2690
+    },
+    {
+      "epoch": 2.955924756281752,
+      "grad_norm": 0.5480318069458008,
+      "learning_rate": 4.7619047619047615e-06,
+      "loss": 0.6227,
+      "step": 2691
+    },
+    {
+      "epoch": 2.9570232047233285,
+      "grad_norm": 0.5831297636032104,
+      "learning_rate": 4.639804639804639e-06,
+      "loss": 0.6264,
+      "step": 2692
+    },
+    {
+      "epoch": 2.9581216531649046,
+      "grad_norm": 1.5778921842575073,
+      "learning_rate": 4.517704517704517e-06,
+      "loss": 0.6352,
+      "step": 2693
+    },
+    {
+      "epoch": 2.9592201016064807,
+      "grad_norm": 0.9567496180534363,
+      "learning_rate": 4.395604395604395e-06,
+      "loss": 0.6067,
+      "step": 2694
+    },
+    {
+      "epoch": 2.960318550048057,
+      "grad_norm": 0.5237869620323181,
+      "learning_rate": 4.273504273504273e-06,
+      "loss": 0.8241,
+      "step": 2695
+    },
+    {
+      "epoch": 2.9614169984896335,
+      "grad_norm": 0.3452164828777313,
+      "learning_rate": 4.151404151404151e-06,
+      "loss": 0.5718,
+      "step": 2696
+    },
+    {
+      "epoch": 2.9625154469312096,
+      "grad_norm": 0.42237767577171326,
+      "learning_rate": 4.0293040293040296e-06,
+      "loss": 0.5199,
+      "step": 2697
+    },
+    {
+      "epoch": 2.963613895372786,
+      "grad_norm": 0.7035055756568909,
+      "learning_rate": 3.907203907203907e-06,
+      "loss": 0.7078,
+      "step": 2698
+    },
+    {
+      "epoch": 2.9647123438143623,
+      "grad_norm": 0.39236482977867126,
+      "learning_rate": 3.785103785103785e-06,
+      "loss": 0.59,
+      "step": 2699
+    },
+    {
+      "epoch": 2.9658107922559385,
+      "grad_norm": 1.1658680438995361,
+      "learning_rate": 3.6630036630036627e-06,
+      "loss": 0.53,
+      "step": 2700
+    },
+    {
+      "epoch": 2.9669092406975146,
+      "grad_norm": 0.6797634363174438,
+      "learning_rate": 3.5409035409035406e-06,
+      "loss": 0.6763,
+      "step": 2701
+    },
+    {
+      "epoch": 2.968007689139091,
+      "grad_norm": 1.0421425104141235,
+      "learning_rate": 3.4188034188034185e-06,
+      "loss": 0.4,
+      "step": 2702
+    },
+    {
+      "epoch": 2.9691061375806673,
+      "grad_norm": 0.36937475204467773,
+      "learning_rate": 3.2967032967032968e-06,
+      "loss": 0.5401,
+      "step": 2703
+    },
+    {
+      "epoch": 2.9702045860222435,
+      "grad_norm": 0.4324638843536377,
+      "learning_rate": 3.174603174603174e-06,
+      "loss": 0.5882,
+      "step": 2704
+    },
+    {
+      "epoch": 2.97130303446382,
+      "grad_norm": 1.2700526714324951,
+      "learning_rate": 3.052503052503052e-06,
+      "loss": 0.613,
+      "step": 2705
+    },
+    {
+      "epoch": 2.972401482905396,
+      "grad_norm": 0.5261131525039673,
+      "learning_rate": 2.93040293040293e-06,
+      "loss": 0.6279,
+      "step": 2706
+    },
+    {
+      "epoch": 2.9734999313469723,
+      "grad_norm": 0.42924660444259644,
+      "learning_rate": 2.8083028083028082e-06,
+      "loss": 1.0058,
+      "step": 2707
+    },
+    {
+      "epoch": 2.9745983797885485,
+      "grad_norm": 3.100399971008301,
+      "learning_rate": 2.686202686202686e-06,
+      "loss": 0.5209,
+      "step": 2708
+    },
+    {
+      "epoch": 2.975696828230125,
+      "grad_norm": 0.3666403293609619,
+      "learning_rate": 2.564102564102564e-06,
+      "loss": 0.5231,
+      "step": 2709
+    },
+    {
+      "epoch": 2.976795276671701,
+      "grad_norm": 1.1315009593963623,
+      "learning_rate": 2.4420024420024414e-06,
+      "loss": 0.4449,
+      "step": 2710
+    },
+    {
+      "epoch": 2.9778937251132778,
+      "grad_norm": 0.3323412537574768,
+      "learning_rate": 2.3199023199023197e-06,
+      "loss": 0.4806,
+      "step": 2711
+    },
+    {
+      "epoch": 2.978992173554854,
+      "grad_norm": 0.7348967790603638,
+      "learning_rate": 2.1978021978021976e-06,
+      "loss": 0.7521,
+      "step": 2712
+    },
+    {
+      "epoch": 2.98009062199643,
+      "grad_norm": 1.018898606300354,
+      "learning_rate": 2.0757020757020754e-06,
+      "loss": 0.8468,
+      "step": 2713
+    },
+    {
+      "epoch": 2.981189070438006,
+      "grad_norm": 0.46808505058288574,
+      "learning_rate": 1.9536019536019533e-06,
+      "loss": 0.6992,
+      "step": 2714
+    },
+    {
+      "epoch": 2.9822875188795823,
+      "grad_norm": 0.5411276817321777,
+      "learning_rate": 1.8315018315018314e-06,
+      "loss": 0.5949,
+      "step": 2715
+    },
+    {
+      "epoch": 2.983385967321159,
+      "grad_norm": 0.45061302185058594,
+      "learning_rate": 1.7094017094017092e-06,
+      "loss": 0.4617,
+      "step": 2716
+    },
+    {
+      "epoch": 2.984484415762735,
+      "grad_norm": 0.44529294967651367,
+      "learning_rate": 1.587301587301587e-06,
+      "loss": 0.5811,
+      "step": 2717
+    },
+    {
+      "epoch": 2.9855828642043116,
+      "grad_norm": 1.255299687385559,
+      "learning_rate": 1.465201465201465e-06,
+      "loss": 1.1899,
+      "step": 2718
+    },
+    {
+      "epoch": 2.9866813126458878,
+      "grad_norm": 0.8325234651565552,
+      "learning_rate": 1.343101343101343e-06,
+      "loss": 0.6344,
+      "step": 2719
+    },
+    {
+      "epoch": 2.987779761087464,
+      "grad_norm": 1.0692095756530762,
+      "learning_rate": 1.2210012210012207e-06,
+      "loss": 0.5136,
+      "step": 2720
+    },
+    {
+      "epoch": 2.98887820952904,
+      "grad_norm": 0.4980855882167816,
+      "learning_rate": 1.0989010989010988e-06,
+      "loss": 0.6352,
+      "step": 2721
+    },
+    {
+      "epoch": 2.9899766579706166,
+      "grad_norm": 0.8502411246299744,
+      "learning_rate": 9.768009768009766e-07,
+      "loss": 0.599,
+      "step": 2722
+    },
+    {
+      "epoch": 2.9910751064121928,
+      "grad_norm": 0.4849570691585541,
+      "learning_rate": 8.547008547008546e-07,
+      "loss": 0.5862,
+      "step": 2723
+    },
+    {
+      "epoch": 2.992173554853769,
+      "grad_norm": 0.5491626858711243,
+      "learning_rate": 7.326007326007325e-07,
+      "loss": 0.5634,
+      "step": 2724
+    },
+    {
+      "epoch": 2.9932720032953455,
+      "grad_norm": 0.7289263606071472,
+      "learning_rate": 6.105006105006104e-07,
+      "loss": 0.6643,
+      "step": 2725
+    },
+    {
+      "epoch": 2.9943704517369216,
+      "grad_norm": 1.5343972444534302,
+      "learning_rate": 4.884004884004883e-07,
+      "loss": 0.71,
+      "step": 2726
+    },
+    {
+      "epoch": 2.9954689001784978,
+      "grad_norm": 0.5619814395904541,
+      "learning_rate": 3.6630036630036624e-07,
+      "loss": 0.721,
+      "step": 2727
+    },
+    {
+      "epoch": 2.996567348620074,
+      "grad_norm": 0.500442624092102,
+      "learning_rate": 2.4420024420024416e-07,
+      "loss": 0.6571,
+      "step": 2728
+    },
+    {
+      "epoch": 2.9976657970616505,
+      "grad_norm": 0.42292630672454834,
+      "learning_rate": 1.2210012210012208e-07,
+      "loss": 0.4772,
+      "step": 2729
+    },
+    {
+      "epoch": 2.9987642455032266,
+      "grad_norm": 0.4350331425666809,
+      "learning_rate": 0.0,
+      "loss": 0.7493,
+      "step": 2730
+    },
+    {
+      "epoch": 2.9987642455032266,
+      "step": 2730,
+      "total_flos": 1.0372510312766669e+18,
+      "train_loss": 0.674373844124022,
+      "train_runtime": 11584.4184,
+      "train_samples_per_second": 1.886,
+      "train_steps_per_second": 0.236
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 2730,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0372510312766669e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}