diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19152 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9987642455032266, + "eval_steps": 500, + "global_step": 2730, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010984484415762735, + "grad_norm": 0.13173329830169678, + "learning_rate": 1.0989010989010988e-06, + "loss": 0.8751, + "step": 1 + }, + { + "epoch": 0.002196896883152547, + "grad_norm": 0.19401921331882477, + "learning_rate": 2.1978021978021976e-06, + "loss": 1.3488, + "step": 2 + }, + { + "epoch": 0.0032953453247288205, + "grad_norm": 0.142131969332695, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.8371, + "step": 3 + }, + { + "epoch": 0.004393793766305094, + "grad_norm": 0.1124999076128006, + "learning_rate": 4.395604395604395e-06, + "loss": 1.0039, + "step": 4 + }, + { + "epoch": 0.005492242207881368, + "grad_norm": 0.20683947205543518, + "learning_rate": 5.494505494505494e-06, + "loss": 1.4423, + "step": 5 + }, + { + "epoch": 0.006590690649457641, + "grad_norm": 0.2007640153169632, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.9797, + "step": 6 + }, + { + "epoch": 0.007689139091033915, + "grad_norm": 0.1362670361995697, + "learning_rate": 7.692307692307692e-06, + "loss": 1.0443, + "step": 7 + }, + { + "epoch": 0.008787587532610188, + "grad_norm": 0.21512511372566223, + "learning_rate": 8.79120879120879e-06, + "loss": 1.2888, + "step": 8 + }, + { + "epoch": 0.009886035974186462, + "grad_norm": 0.13403186202049255, + "learning_rate": 9.89010989010989e-06, + "loss": 0.9637, + "step": 9 + }, + { + "epoch": 0.010984484415762736, + "grad_norm": 0.16911157965660095, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.8824, + "step": 10 + }, + { + "epoch": 0.012082932857339008, + "grad_norm": 0.19280359148979187, + "learning_rate": 1.2087912087912087e-05, + "loss": 0.9843, + "step": 11 + }, + { + "epoch": 0.013181381298915282, + "grad_norm": 0.15720519423484802, + "learning_rate": 1.3186813186813187e-05, + "loss": 0.9769, + "step": 12 + }, + { + "epoch": 0.014279829740491556, + "grad_norm": 0.18622402846813202, + "learning_rate": 1.4285714285714284e-05, + "loss": 0.903, + "step": 13 + }, + { + "epoch": 0.01537827818206783, + "grad_norm": 0.1491895169019699, + "learning_rate": 1.5384615384615384e-05, + "loss": 1.065, + "step": 14 + }, + { + "epoch": 0.016476726623644102, + "grad_norm": 0.16883142292499542, + "learning_rate": 1.6483516483516482e-05, + "loss": 0.9916, + "step": 15 + }, + { + "epoch": 0.017575175065220376, + "grad_norm": 0.155453160405159, + "learning_rate": 1.758241758241758e-05, + "loss": 1.1048, + "step": 16 + }, + { + "epoch": 0.01867362350679665, + "grad_norm": 0.12869666516780853, + "learning_rate": 1.868131868131868e-05, + "loss": 0.9355, + "step": 17 + }, + { + "epoch": 0.019772071948372924, + "grad_norm": 0.18860433995723724, + "learning_rate": 1.978021978021978e-05, + "loss": 1.1779, + "step": 18 + }, + { + "epoch": 0.020870520389949198, + "grad_norm": 0.30738529562950134, + "learning_rate": 2.087912087912088e-05, + "loss": 0.905, + "step": 19 + }, + { + "epoch": 0.021968968831525472, + "grad_norm": 0.30248674750328064, + "learning_rate": 2.1978021978021977e-05, + "loss": 1.0749, + "step": 20 + }, + { + "epoch": 0.023067417273101742, + "grad_norm": 0.17005079984664917, + "learning_rate": 2.3076923076923076e-05, + "loss": 1.0141, + "step": 21 + }, + { + "epoch": 0.024165865714678016, + "grad_norm": 0.5497377514839172, + "learning_rate": 2.4175824175824174e-05, + "loss": 0.804, + "step": 22 + }, + { + "epoch": 0.02526431415625429, + "grad_norm": 0.23464925587177277, + "learning_rate": 2.5274725274725276e-05, + "loss": 1.0592, + "step": 23 + }, + { + "epoch": 0.026362762597830564, + "grad_norm": 0.2906591594219208, + "learning_rate": 2.6373626373626374e-05, + "loss": 1.4096, + "step": 24 + }, + { + "epoch": 0.027461211039406838, + "grad_norm": 0.14552968740463257, + "learning_rate": 2.747252747252747e-05, + "loss": 0.8827, + "step": 25 + }, + { + "epoch": 0.028559659480983112, + "grad_norm": 0.26139914989471436, + "learning_rate": 2.8571428571428567e-05, + "loss": 1.1081, + "step": 26 + }, + { + "epoch": 0.029658107922559386, + "grad_norm": 0.16122505068778992, + "learning_rate": 2.9670329670329666e-05, + "loss": 0.8967, + "step": 27 + }, + { + "epoch": 0.03075655636413566, + "grad_norm": 0.19174647331237793, + "learning_rate": 3.076923076923077e-05, + "loss": 0.7527, + "step": 28 + }, + { + "epoch": 0.031855004805711934, + "grad_norm": 0.24506032466888428, + "learning_rate": 3.1868131868131866e-05, + "loss": 1.0981, + "step": 29 + }, + { + "epoch": 0.032953453247288204, + "grad_norm": 0.18928349018096924, + "learning_rate": 3.2967032967032964e-05, + "loss": 1.2955, + "step": 30 + }, + { + "epoch": 0.03405190168886448, + "grad_norm": 0.20482106506824493, + "learning_rate": 3.406593406593406e-05, + "loss": 0.886, + "step": 31 + }, + { + "epoch": 0.03515035013044075, + "grad_norm": 0.17304010689258575, + "learning_rate": 3.516483516483516e-05, + "loss": 1.0062, + "step": 32 + }, + { + "epoch": 0.03624879857201702, + "grad_norm": 0.17006444931030273, + "learning_rate": 3.626373626373626e-05, + "loss": 0.76, + "step": 33 + }, + { + "epoch": 0.0373472470135933, + "grad_norm": 0.16570955514907837, + "learning_rate": 3.736263736263736e-05, + "loss": 0.7512, + "step": 34 + }, + { + "epoch": 0.03844569545516957, + "grad_norm": 0.4470347464084625, + "learning_rate": 3.8461538461538456e-05, + "loss": 1.051, + "step": 35 + }, + { + "epoch": 0.03954414389674585, + "grad_norm": 0.3013080060482025, + "learning_rate": 3.956043956043956e-05, + "loss": 1.1269, + "step": 36 + }, + { + "epoch": 0.04064259233832212, + "grad_norm": 0.33114469051361084, + "learning_rate": 4.065934065934065e-05, + "loss": 1.046, + "step": 37 + }, + { + "epoch": 0.041741040779898396, + "grad_norm": 0.3496829867362976, + "learning_rate": 4.175824175824176e-05, + "loss": 0.9139, + "step": 38 + }, + { + "epoch": 0.042839489221474666, + "grad_norm": 0.36173877120018005, + "learning_rate": 4.285714285714285e-05, + "loss": 1.16, + "step": 39 + }, + { + "epoch": 0.043937937663050944, + "grad_norm": 0.23047995567321777, + "learning_rate": 4.3956043956043955e-05, + "loss": 0.8623, + "step": 40 + }, + { + "epoch": 0.045036386104627214, + "grad_norm": 0.33733946084976196, + "learning_rate": 4.5054945054945046e-05, + "loss": 0.873, + "step": 41 + }, + { + "epoch": 0.046134834546203485, + "grad_norm": 0.43975624442100525, + "learning_rate": 4.615384615384615e-05, + "loss": 0.9374, + "step": 42 + }, + { + "epoch": 0.04723328298777976, + "grad_norm": 0.5429202318191528, + "learning_rate": 4.725274725274725e-05, + "loss": 1.0699, + "step": 43 + }, + { + "epoch": 0.04833173142935603, + "grad_norm": 0.39317595958709717, + "learning_rate": 4.835164835164835e-05, + "loss": 0.7719, + "step": 44 + }, + { + "epoch": 0.04943017987093231, + "grad_norm": 0.41328710317611694, + "learning_rate": 4.9450549450549446e-05, + "loss": 1.112, + "step": 45 + }, + { + "epoch": 0.05052862831250858, + "grad_norm": 0.5977774858474731, + "learning_rate": 5.054945054945055e-05, + "loss": 0.9408, + "step": 46 + }, + { + "epoch": 0.05162707675408486, + "grad_norm": 0.6984797716140747, + "learning_rate": 5.164835164835164e-05, + "loss": 0.9766, + "step": 47 + }, + { + "epoch": 0.05272552519566113, + "grad_norm": 0.5161548256874084, + "learning_rate": 5.274725274725275e-05, + "loss": 1.3705, + "step": 48 + }, + { + "epoch": 0.0538239736372374, + "grad_norm": 0.5750108361244202, + "learning_rate": 5.384615384615384e-05, + "loss": 0.9492, + "step": 49 + }, + { + "epoch": 0.054922422078813676, + "grad_norm": 0.7861920595169067, + "learning_rate": 5.494505494505494e-05, + "loss": 1.1495, + "step": 50 + }, + { + "epoch": 0.05602087052038995, + "grad_norm": 0.5992287993431091, + "learning_rate": 5.6043956043956037e-05, + "loss": 1.2818, + "step": 51 + }, + { + "epoch": 0.057119318961966224, + "grad_norm": 0.5470016598701477, + "learning_rate": 5.7142857142857135e-05, + "loss": 1.0385, + "step": 52 + }, + { + "epoch": 0.058217767403542495, + "grad_norm": 0.7035269141197205, + "learning_rate": 5.824175824175824e-05, + "loss": 0.785, + "step": 53 + }, + { + "epoch": 0.05931621584511877, + "grad_norm": 0.5253639817237854, + "learning_rate": 5.934065934065933e-05, + "loss": 0.6092, + "step": 54 + }, + { + "epoch": 0.06041466428669504, + "grad_norm": 0.5233064293861389, + "learning_rate": 6.043956043956044e-05, + "loss": 0.7853, + "step": 55 + }, + { + "epoch": 0.06151311272827132, + "grad_norm": 0.4508589804172516, + "learning_rate": 6.153846153846154e-05, + "loss": 0.5737, + "step": 56 + }, + { + "epoch": 0.06261156116984759, + "grad_norm": 1.0521594285964966, + "learning_rate": 6.263736263736263e-05, + "loss": 1.0132, + "step": 57 + }, + { + "epoch": 0.06371000961142387, + "grad_norm": 0.3572557866573334, + "learning_rate": 6.373626373626373e-05, + "loss": 0.655, + "step": 58 + }, + { + "epoch": 0.06480845805300013, + "grad_norm": 0.600371241569519, + "learning_rate": 6.483516483516483e-05, + "loss": 0.8897, + "step": 59 + }, + { + "epoch": 0.06590690649457641, + "grad_norm": 0.6430579423904419, + "learning_rate": 6.593406593406593e-05, + "loss": 0.8058, + "step": 60 + }, + { + "epoch": 0.06700535493615269, + "grad_norm": 0.5309410095214844, + "learning_rate": 6.703296703296703e-05, + "loss": 0.7312, + "step": 61 + }, + { + "epoch": 0.06810380337772896, + "grad_norm": 0.46225860714912415, + "learning_rate": 6.813186813186813e-05, + "loss": 0.8607, + "step": 62 + }, + { + "epoch": 0.06920225181930523, + "grad_norm": 0.8889493346214294, + "learning_rate": 6.923076923076922e-05, + "loss": 0.7791, + "step": 63 + }, + { + "epoch": 0.0703007002608815, + "grad_norm": 0.5721575617790222, + "learning_rate": 7.032967032967032e-05, + "loss": 0.9426, + "step": 64 + }, + { + "epoch": 0.07139914870245778, + "grad_norm": 0.8355056047439575, + "learning_rate": 7.142857142857142e-05, + "loss": 0.621, + "step": 65 + }, + { + "epoch": 0.07249759714403405, + "grad_norm": 1.3048707246780396, + "learning_rate": 7.252747252747252e-05, + "loss": 0.8869, + "step": 66 + }, + { + "epoch": 0.07359604558561032, + "grad_norm": 0.5817797183990479, + "learning_rate": 7.362637362637362e-05, + "loss": 0.8385, + "step": 67 + }, + { + "epoch": 0.0746944940271866, + "grad_norm": 1.2051454782485962, + "learning_rate": 7.472527472527472e-05, + "loss": 0.7566, + "step": 68 + }, + { + "epoch": 0.07579294246876288, + "grad_norm": 0.8565987944602966, + "learning_rate": 7.582417582417581e-05, + "loss": 0.8374, + "step": 69 + }, + { + "epoch": 0.07689139091033914, + "grad_norm": 0.7503894567489624, + "learning_rate": 7.692307692307691e-05, + "loss": 0.6749, + "step": 70 + }, + { + "epoch": 0.07798983935191542, + "grad_norm": 0.6298589706420898, + "learning_rate": 7.802197802197802e-05, + "loss": 0.9096, + "step": 71 + }, + { + "epoch": 0.0790882877934917, + "grad_norm": 0.8327789306640625, + "learning_rate": 7.912087912087912e-05, + "loss": 0.9836, + "step": 72 + }, + { + "epoch": 0.08018673623506796, + "grad_norm": 1.0001461505889893, + "learning_rate": 8.021978021978021e-05, + "loss": 0.6917, + "step": 73 + }, + { + "epoch": 0.08128518467664424, + "grad_norm": 0.8373435735702515, + "learning_rate": 8.13186813186813e-05, + "loss": 0.7703, + "step": 74 + }, + { + "epoch": 0.08238363311822051, + "grad_norm": 0.9785758256912231, + "learning_rate": 8.241758241758242e-05, + "loss": 0.8004, + "step": 75 + }, + { + "epoch": 0.08348208155979679, + "grad_norm": 0.8900540471076965, + "learning_rate": 8.351648351648352e-05, + "loss": 0.8238, + "step": 76 + }, + { + "epoch": 0.08458053000137306, + "grad_norm": 0.7411159873008728, + "learning_rate": 8.46153846153846e-05, + "loss": 1.0364, + "step": 77 + }, + { + "epoch": 0.08567897844294933, + "grad_norm": 0.4975040555000305, + "learning_rate": 8.57142857142857e-05, + "loss": 0.4814, + "step": 78 + }, + { + "epoch": 0.08677742688452561, + "grad_norm": 0.6698398590087891, + "learning_rate": 8.681318681318681e-05, + "loss": 0.6828, + "step": 79 + }, + { + "epoch": 0.08787587532610189, + "grad_norm": 0.5883696675300598, + "learning_rate": 8.791208791208791e-05, + "loss": 0.92, + "step": 80 + }, + { + "epoch": 0.08897432376767815, + "grad_norm": 0.9050906896591187, + "learning_rate": 8.901098901098901e-05, + "loss": 0.7229, + "step": 81 + }, + { + "epoch": 0.09007277220925443, + "grad_norm": 0.5996706485748291, + "learning_rate": 9.010989010989009e-05, + "loss": 0.699, + "step": 82 + }, + { + "epoch": 0.0911712206508307, + "grad_norm": 2.0782630443573, + "learning_rate": 9.120879120879119e-05, + "loss": 1.2118, + "step": 83 + }, + { + "epoch": 0.09226966909240697, + "grad_norm": 0.759730875492096, + "learning_rate": 9.23076923076923e-05, + "loss": 0.6397, + "step": 84 + }, + { + "epoch": 0.09336811753398325, + "grad_norm": 1.1138097047805786, + "learning_rate": 9.34065934065934e-05, + "loss": 0.8973, + "step": 85 + }, + { + "epoch": 0.09446656597555952, + "grad_norm": 0.9852680563926697, + "learning_rate": 9.45054945054945e-05, + "loss": 1.0733, + "step": 86 + }, + { + "epoch": 0.0955650144171358, + "grad_norm": 0.8435002565383911, + "learning_rate": 9.560439560439558e-05, + "loss": 0.8977, + "step": 87 + }, + { + "epoch": 0.09666346285871207, + "grad_norm": 1.3031998872756958, + "learning_rate": 9.67032967032967e-05, + "loss": 0.9852, + "step": 88 + }, + { + "epoch": 0.09776191130028834, + "grad_norm": 0.6343463063240051, + "learning_rate": 9.78021978021978e-05, + "loss": 0.6147, + "step": 89 + }, + { + "epoch": 0.09886035974186462, + "grad_norm": 0.7061794996261597, + "learning_rate": 9.890109890109889e-05, + "loss": 0.7437, + "step": 90 + }, + { + "epoch": 0.09995880818344088, + "grad_norm": 1.2231422662734985, + "learning_rate": 9.999999999999999e-05, + "loss": 0.7944, + "step": 91 + }, + { + "epoch": 0.10105725662501716, + "grad_norm": 0.7199704647064209, + "learning_rate": 0.0001010989010989011, + "loss": 0.7355, + "step": 92 + }, + { + "epoch": 0.10215570506659344, + "grad_norm": 1.2740516662597656, + "learning_rate": 0.00010219780219780219, + "loss": 0.7622, + "step": 93 + }, + { + "epoch": 0.10325415350816972, + "grad_norm": 0.7762659788131714, + "learning_rate": 0.00010329670329670329, + "loss": 0.7074, + "step": 94 + }, + { + "epoch": 0.10435260194974598, + "grad_norm": 0.6618936061859131, + "learning_rate": 0.00010439560439560438, + "loss": 0.7667, + "step": 95 + }, + { + "epoch": 0.10545105039132226, + "grad_norm": 0.7244533896446228, + "learning_rate": 0.0001054945054945055, + "loss": 0.6451, + "step": 96 + }, + { + "epoch": 0.10654949883289853, + "grad_norm": 0.6391953229904175, + "learning_rate": 0.0001065934065934066, + "loss": 0.5637, + "step": 97 + }, + { + "epoch": 0.1076479472744748, + "grad_norm": 0.6992442607879639, + "learning_rate": 0.00010769230769230768, + "loss": 0.7112, + "step": 98 + }, + { + "epoch": 0.10874639571605108, + "grad_norm": 1.0820791721343994, + "learning_rate": 0.00010879120879120878, + "loss": 0.9199, + "step": 99 + }, + { + "epoch": 0.10984484415762735, + "grad_norm": 0.6012185215950012, + "learning_rate": 0.00010989010989010988, + "loss": 0.5574, + "step": 100 + }, + { + "epoch": 0.11094329259920363, + "grad_norm": 0.822455644607544, + "learning_rate": 0.00011098901098901099, + "loss": 0.5185, + "step": 101 + }, + { + "epoch": 0.1120417410407799, + "grad_norm": 0.9417555332183838, + "learning_rate": 0.00011208791208791207, + "loss": 0.6883, + "step": 102 + }, + { + "epoch": 0.11314018948235617, + "grad_norm": 1.0258208513259888, + "learning_rate": 0.00011318681318681317, + "loss": 0.7588, + "step": 103 + }, + { + "epoch": 0.11423863792393245, + "grad_norm": 1.904179573059082, + "learning_rate": 0.00011428571428571427, + "loss": 0.7425, + "step": 104 + }, + { + "epoch": 0.11533708636550873, + "grad_norm": 1.5453238487243652, + "learning_rate": 0.00011538461538461538, + "loss": 0.658, + "step": 105 + }, + { + "epoch": 0.11643553480708499, + "grad_norm": 0.8801619410514832, + "learning_rate": 0.00011648351648351648, + "loss": 0.8432, + "step": 106 + }, + { + "epoch": 0.11753398324866127, + "grad_norm": 0.8567579388618469, + "learning_rate": 0.00011758241758241756, + "loss": 0.5904, + "step": 107 + }, + { + "epoch": 0.11863243169023754, + "grad_norm": 0.9351131319999695, + "learning_rate": 0.00011868131868131866, + "loss": 0.7228, + "step": 108 + }, + { + "epoch": 0.11973088013181381, + "grad_norm": 0.8817545175552368, + "learning_rate": 0.00011978021978021978, + "loss": 0.7853, + "step": 109 + }, + { + "epoch": 0.12082932857339009, + "grad_norm": 1.0484094619750977, + "learning_rate": 0.00012087912087912087, + "loss": 0.7049, + "step": 110 + }, + { + "epoch": 0.12192777701496636, + "grad_norm": 1.80658757686615, + "learning_rate": 0.00012197802197802197, + "loss": 0.669, + "step": 111 + }, + { + "epoch": 0.12302622545654264, + "grad_norm": 1.5311473608016968, + "learning_rate": 0.00012307692307692307, + "loss": 0.8342, + "step": 112 + }, + { + "epoch": 0.1241246738981189, + "grad_norm": 0.8968105912208557, + "learning_rate": 0.00012417582417582416, + "loss": 0.7199, + "step": 113 + }, + { + "epoch": 0.12522312233969518, + "grad_norm": 0.6149659156799316, + "learning_rate": 0.00012527472527472527, + "loss": 0.4961, + "step": 114 + }, + { + "epoch": 0.12632157078127146, + "grad_norm": 8.04592227935791, + "learning_rate": 0.00012637362637362635, + "loss": 0.7515, + "step": 115 + }, + { + "epoch": 0.12742001922284774, + "grad_norm": 0.7797659039497375, + "learning_rate": 0.00012747252747252746, + "loss": 0.7281, + "step": 116 + }, + { + "epoch": 0.128518467664424, + "grad_norm": 0.6414046883583069, + "learning_rate": 0.00012857142857142855, + "loss": 0.6655, + "step": 117 + }, + { + "epoch": 0.12961691610600026, + "grad_norm": 4.678529262542725, + "learning_rate": 0.00012967032967032966, + "loss": 0.9165, + "step": 118 + }, + { + "epoch": 0.13071536454757654, + "grad_norm": 0.8540724515914917, + "learning_rate": 0.00013076923076923077, + "loss": 0.7064, + "step": 119 + }, + { + "epoch": 0.13181381298915282, + "grad_norm": 1.057844638824463, + "learning_rate": 0.00013186813186813186, + "loss": 0.6617, + "step": 120 + }, + { + "epoch": 0.1329122614307291, + "grad_norm": 0.8429140448570251, + "learning_rate": 0.00013296703296703294, + "loss": 0.8156, + "step": 121 + }, + { + "epoch": 0.13401070987230537, + "grad_norm": 0.9944230914115906, + "learning_rate": 0.00013406593406593405, + "loss": 0.5851, + "step": 122 + }, + { + "epoch": 0.13510915831388165, + "grad_norm": 0.6582810878753662, + "learning_rate": 0.00013516483516483517, + "loss": 0.5819, + "step": 123 + }, + { + "epoch": 0.13620760675545793, + "grad_norm": 1.3106951713562012, + "learning_rate": 0.00013626373626373625, + "loss": 0.7598, + "step": 124 + }, + { + "epoch": 0.13730605519703418, + "grad_norm": 1.0464080572128296, + "learning_rate": 0.00013736263736263734, + "loss": 0.7241, + "step": 125 + }, + { + "epoch": 0.13840450363861045, + "grad_norm": 0.8519262075424194, + "learning_rate": 0.00013846153846153845, + "loss": 0.7001, + "step": 126 + }, + { + "epoch": 0.13950295208018673, + "grad_norm": 1.2764228582382202, + "learning_rate": 0.00013956043956043956, + "loss": 0.7152, + "step": 127 + }, + { + "epoch": 0.140601400521763, + "grad_norm": 1.157472014427185, + "learning_rate": 0.00014065934065934064, + "loss": 0.697, + "step": 128 + }, + { + "epoch": 0.1416998489633393, + "grad_norm": 0.7153847813606262, + "learning_rate": 0.00014175824175824173, + "loss": 0.6897, + "step": 129 + }, + { + "epoch": 0.14279829740491556, + "grad_norm": 0.7254152297973633, + "learning_rate": 0.00014285714285714284, + "loss": 0.5263, + "step": 130 + }, + { + "epoch": 0.14389674584649184, + "grad_norm": 1.3370522260665894, + "learning_rate": 0.00014395604395604395, + "loss": 0.7587, + "step": 131 + }, + { + "epoch": 0.1449951942880681, + "grad_norm": 1.092029333114624, + "learning_rate": 0.00014505494505494504, + "loss": 0.8674, + "step": 132 + }, + { + "epoch": 0.14609364272964437, + "grad_norm": 0.6123655438423157, + "learning_rate": 0.00014615384615384615, + "loss": 0.7163, + "step": 133 + }, + { + "epoch": 0.14719209117122065, + "grad_norm": 0.8476639986038208, + "learning_rate": 0.00014725274725274723, + "loss": 0.7241, + "step": 134 + }, + { + "epoch": 0.14829053961279692, + "grad_norm": 0.9986979961395264, + "learning_rate": 0.00014835164835164835, + "loss": 0.6229, + "step": 135 + }, + { + "epoch": 0.1493889880543732, + "grad_norm": 0.8208728432655334, + "learning_rate": 0.00014945054945054943, + "loss": 0.5441, + "step": 136 + }, + { + "epoch": 0.15048743649594948, + "grad_norm": 0.742091953754425, + "learning_rate": 0.00015054945054945054, + "loss": 0.6047, + "step": 137 + }, + { + "epoch": 0.15158588493752576, + "grad_norm": 1.6566306352615356, + "learning_rate": 0.00015164835164835163, + "loss": 0.6381, + "step": 138 + }, + { + "epoch": 0.152684333379102, + "grad_norm": 0.7735741138458252, + "learning_rate": 0.0001527472527472527, + "loss": 0.5842, + "step": 139 + }, + { + "epoch": 0.15378278182067828, + "grad_norm": 0.7116795778274536, + "learning_rate": 0.00015384615384615382, + "loss": 0.7117, + "step": 140 + }, + { + "epoch": 0.15488123026225456, + "grad_norm": 0.6912885904312134, + "learning_rate": 0.00015494505494505494, + "loss": 0.763, + "step": 141 + }, + { + "epoch": 0.15597967870383084, + "grad_norm": 1.0789505243301392, + "learning_rate": 0.00015604395604395605, + "loss": 0.5534, + "step": 142 + }, + { + "epoch": 0.15707812714540711, + "grad_norm": 1.0304033756256104, + "learning_rate": 0.00015714285714285713, + "loss": 0.4961, + "step": 143 + }, + { + "epoch": 0.1581765755869834, + "grad_norm": 1.0216940641403198, + "learning_rate": 0.00015824175824175824, + "loss": 0.8167, + "step": 144 + }, + { + "epoch": 0.15927502402855967, + "grad_norm": 0.7767283916473389, + "learning_rate": 0.00015934065934065933, + "loss": 0.649, + "step": 145 + }, + { + "epoch": 0.16037347247013592, + "grad_norm": 0.6125204563140869, + "learning_rate": 0.00016043956043956041, + "loss": 0.6596, + "step": 146 + }, + { + "epoch": 0.1614719209117122, + "grad_norm": 2.113314390182495, + "learning_rate": 0.00016153846153846153, + "loss": 0.6825, + "step": 147 + }, + { + "epoch": 0.16257036935328847, + "grad_norm": 1.3892889022827148, + "learning_rate": 0.0001626373626373626, + "loss": 0.5162, + "step": 148 + }, + { + "epoch": 0.16366881779486475, + "grad_norm": 1.2544710636138916, + "learning_rate": 0.0001637362637362637, + "loss": 0.5992, + "step": 149 + }, + { + "epoch": 0.16476726623644103, + "grad_norm": 1.2952786684036255, + "learning_rate": 0.00016483516483516484, + "loss": 0.5968, + "step": 150 + }, + { + "epoch": 0.1658657146780173, + "grad_norm": 0.9910382628440857, + "learning_rate": 0.00016593406593406592, + "loss": 0.6138, + "step": 151 + }, + { + "epoch": 0.16696416311959358, + "grad_norm": 0.7291635870933533, + "learning_rate": 0.00016703296703296703, + "loss": 0.8957, + "step": 152 + }, + { + "epoch": 0.16806261156116986, + "grad_norm": 0.7290105819702148, + "learning_rate": 0.00016813186813186812, + "loss": 0.4864, + "step": 153 + }, + { + "epoch": 0.1691610600027461, + "grad_norm": 1.1888444423675537, + "learning_rate": 0.0001692307692307692, + "loss": 0.913, + "step": 154 + }, + { + "epoch": 0.1702595084443224, + "grad_norm": 0.8183659315109253, + "learning_rate": 0.0001703296703296703, + "loss": 0.6405, + "step": 155 + }, + { + "epoch": 0.17135795688589867, + "grad_norm": 0.8549530506134033, + "learning_rate": 0.0001714285714285714, + "loss": 0.7019, + "step": 156 + }, + { + "epoch": 0.17245640532747494, + "grad_norm": 0.5960697531700134, + "learning_rate": 0.0001725274725274725, + "loss": 0.6728, + "step": 157 + }, + { + "epoch": 0.17355485376905122, + "grad_norm": 0.6802973747253418, + "learning_rate": 0.00017362637362637362, + "loss": 0.6462, + "step": 158 + }, + { + "epoch": 0.1746533022106275, + "grad_norm": 0.5056049823760986, + "learning_rate": 0.00017472527472527473, + "loss": 0.5155, + "step": 159 + }, + { + "epoch": 0.17575175065220378, + "grad_norm": 0.8181887865066528, + "learning_rate": 0.00017582417582417582, + "loss": 0.6631, + "step": 160 + }, + { + "epoch": 0.17685019909378003, + "grad_norm": 0.5748574137687683, + "learning_rate": 0.0001769230769230769, + "loss": 0.5807, + "step": 161 + }, + { + "epoch": 0.1779486475353563, + "grad_norm": 0.8585043549537659, + "learning_rate": 0.00017802197802197802, + "loss": 0.5412, + "step": 162 + }, + { + "epoch": 0.17904709597693258, + "grad_norm": 0.8763203620910645, + "learning_rate": 0.0001791208791208791, + "loss": 1.0859, + "step": 163 + }, + { + "epoch": 0.18014554441850886, + "grad_norm": 0.7327267527580261, + "learning_rate": 0.00018021978021978018, + "loss": 0.8034, + "step": 164 + }, + { + "epoch": 0.18124399286008513, + "grad_norm": 0.6813991665840149, + "learning_rate": 0.0001813186813186813, + "loss": 0.9236, + "step": 165 + }, + { + "epoch": 0.1823424413016614, + "grad_norm": 2.9234185218811035, + "learning_rate": 0.00018241758241758238, + "loss": 0.9148, + "step": 166 + }, + { + "epoch": 0.1834408897432377, + "grad_norm": 0.8117207884788513, + "learning_rate": 0.00018351648351648352, + "loss": 1.0514, + "step": 167 + }, + { + "epoch": 0.18453933818481394, + "grad_norm": 0.6485300064086914, + "learning_rate": 0.0001846153846153846, + "loss": 0.4764, + "step": 168 + }, + { + "epoch": 0.18563778662639022, + "grad_norm": 0.43059054017066956, + "learning_rate": 0.00018571428571428572, + "loss": 0.6289, + "step": 169 + }, + { + "epoch": 0.1867362350679665, + "grad_norm": 1.007095456123352, + "learning_rate": 0.0001868131868131868, + "loss": 0.5889, + "step": 170 + }, + { + "epoch": 0.18783468350954277, + "grad_norm": 1.6733218431472778, + "learning_rate": 0.0001879120879120879, + "loss": 0.8036, + "step": 171 + }, + { + "epoch": 0.18893313195111905, + "grad_norm": 0.7533760666847229, + "learning_rate": 0.000189010989010989, + "loss": 0.7282, + "step": 172 + }, + { + "epoch": 0.19003158039269533, + "grad_norm": 0.45892444252967834, + "learning_rate": 0.00019010989010989008, + "loss": 0.6273, + "step": 173 + }, + { + "epoch": 0.1911300288342716, + "grad_norm": 0.54690021276474, + "learning_rate": 0.00019120879120879117, + "loss": 0.669, + "step": 174 + }, + { + "epoch": 0.19222847727584785, + "grad_norm": 0.7361836433410645, + "learning_rate": 0.0001923076923076923, + "loss": 0.8945, + "step": 175 + }, + { + "epoch": 0.19332692571742413, + "grad_norm": 0.5876324772834778, + "learning_rate": 0.0001934065934065934, + "loss": 0.7557, + "step": 176 + }, + { + "epoch": 0.1944253741590004, + "grad_norm": 0.7753897309303284, + "learning_rate": 0.0001945054945054945, + "loss": 0.7904, + "step": 177 + }, + { + "epoch": 0.19552382260057669, + "grad_norm": 0.6244968771934509, + "learning_rate": 0.0001956043956043956, + "loss": 0.7617, + "step": 178 + }, + { + "epoch": 0.19662227104215296, + "grad_norm": 0.6300948262214661, + "learning_rate": 0.00019670329670329667, + "loss": 0.5884, + "step": 179 + }, + { + "epoch": 0.19772071948372924, + "grad_norm": 0.5845354795455933, + "learning_rate": 0.00019780219780219779, + "loss": 0.8034, + "step": 180 + }, + { + "epoch": 0.19881916792530552, + "grad_norm": 0.5231277942657471, + "learning_rate": 0.00019890109890109887, + "loss": 0.5302, + "step": 181 + }, + { + "epoch": 0.19991761636688177, + "grad_norm": 0.8393481969833374, + "learning_rate": 0.00019999999999999998, + "loss": 0.6376, + "step": 182 + }, + { + "epoch": 0.20101606480845804, + "grad_norm": 0.5777038335800171, + "learning_rate": 0.00020109890109890107, + "loss": 0.5777, + "step": 183 + }, + { + "epoch": 0.20211451325003432, + "grad_norm": 0.7751956582069397, + "learning_rate": 0.0002021978021978022, + "loss": 0.8368, + "step": 184 + }, + { + "epoch": 0.2032129616916106, + "grad_norm": 1.5582187175750732, + "learning_rate": 0.0002032967032967033, + "loss": 0.5087, + "step": 185 + }, + { + "epoch": 0.20431141013318688, + "grad_norm": 0.8304231762886047, + "learning_rate": 0.00020439560439560438, + "loss": 0.5512, + "step": 186 + }, + { + "epoch": 0.20540985857476315, + "grad_norm": 0.8545000553131104, + "learning_rate": 0.0002054945054945055, + "loss": 1.2533, + "step": 187 + }, + { + "epoch": 0.20650830701633943, + "grad_norm": 0.4891647696495056, + "learning_rate": 0.00020659340659340657, + "loss": 0.5738, + "step": 188 + }, + { + "epoch": 0.20760675545791568, + "grad_norm": 0.7159665822982788, + "learning_rate": 0.00020769230769230766, + "loss": 0.9266, + "step": 189 + }, + { + "epoch": 0.20870520389949196, + "grad_norm": 0.5053237080574036, + "learning_rate": 0.00020879120879120877, + "loss": 0.4574, + "step": 190 + }, + { + "epoch": 0.20980365234106824, + "grad_norm": 0.728336751461029, + "learning_rate": 0.00020989010989010985, + "loss": 0.6871, + "step": 191 + }, + { + "epoch": 0.2109021007826445, + "grad_norm": 0.8593311309814453, + "learning_rate": 0.000210989010989011, + "loss": 0.6788, + "step": 192 + }, + { + "epoch": 0.2120005492242208, + "grad_norm": 1.247111201286316, + "learning_rate": 0.00021208791208791208, + "loss": 0.5428, + "step": 193 + }, + { + "epoch": 0.21309899766579707, + "grad_norm": 0.6636946201324463, + "learning_rate": 0.0002131868131868132, + "loss": 0.7935, + "step": 194 + }, + { + "epoch": 0.21419744610737335, + "grad_norm": 0.5811622738838196, + "learning_rate": 0.00021428571428571427, + "loss": 0.4322, + "step": 195 + }, + { + "epoch": 0.2152958945489496, + "grad_norm": 0.5329126715660095, + "learning_rate": 0.00021538461538461536, + "loss": 0.7037, + "step": 196 + }, + { + "epoch": 0.21639434299052587, + "grad_norm": 1.730969786643982, + "learning_rate": 0.00021648351648351647, + "loss": 1.0315, + "step": 197 + }, + { + "epoch": 0.21749279143210215, + "grad_norm": 0.5242175459861755, + "learning_rate": 0.00021758241758241756, + "loss": 0.9285, + "step": 198 + }, + { + "epoch": 0.21859123987367843, + "grad_norm": 0.4745596945285797, + "learning_rate": 0.00021868131868131864, + "loss": 0.5414, + "step": 199 + }, + { + "epoch": 0.2196896883152547, + "grad_norm": 0.8693228363990784, + "learning_rate": 0.00021978021978021975, + "loss": 0.4576, + "step": 200 + }, + { + "epoch": 0.22078813675683098, + "grad_norm": 0.7073357105255127, + "learning_rate": 0.00022087912087912086, + "loss": 0.778, + "step": 201 + }, + { + "epoch": 0.22188658519840726, + "grad_norm": 0.535009503364563, + "learning_rate": 0.00022197802197802198, + "loss": 0.7734, + "step": 202 + }, + { + "epoch": 0.2229850336399835, + "grad_norm": 0.5862578749656677, + "learning_rate": 0.00022307692307692306, + "loss": 0.8612, + "step": 203 + }, + { + "epoch": 0.2240834820815598, + "grad_norm": 0.5167233943939209, + "learning_rate": 0.00022417582417582415, + "loss": 0.6122, + "step": 204 + }, + { + "epoch": 0.22518193052313606, + "grad_norm": 0.8982027769088745, + "learning_rate": 0.00022527472527472526, + "loss": 0.8905, + "step": 205 + }, + { + "epoch": 0.22628037896471234, + "grad_norm": 0.7311340570449829, + "learning_rate": 0.00022637362637362634, + "loss": 1.0151, + "step": 206 + }, + { + "epoch": 0.22737882740628862, + "grad_norm": 0.45674124360084534, + "learning_rate": 0.00022747252747252745, + "loss": 0.7056, + "step": 207 + }, + { + "epoch": 0.2284772758478649, + "grad_norm": 0.6916844844818115, + "learning_rate": 0.00022857142857142854, + "loss": 0.5977, + "step": 208 + }, + { + "epoch": 0.22957572428944117, + "grad_norm": 0.6632958650588989, + "learning_rate": 0.00022967032967032962, + "loss": 0.8228, + "step": 209 + }, + { + "epoch": 0.23067417273101745, + "grad_norm": 0.3243491053581238, + "learning_rate": 0.00023076923076923076, + "loss": 0.4823, + "step": 210 + }, + { + "epoch": 0.2317726211725937, + "grad_norm": 0.45630499720573425, + "learning_rate": 0.00023186813186813185, + "loss": 0.7206, + "step": 211 + }, + { + "epoch": 0.23287106961416998, + "grad_norm": 0.6726184487342834, + "learning_rate": 0.00023296703296703296, + "loss": 0.8211, + "step": 212 + }, + { + "epoch": 0.23396951805574626, + "grad_norm": 0.45092982053756714, + "learning_rate": 0.00023406593406593405, + "loss": 0.6812, + "step": 213 + }, + { + "epoch": 0.23506796649732253, + "grad_norm": 0.5624651312828064, + "learning_rate": 0.00023516483516483513, + "loss": 0.726, + "step": 214 + }, + { + "epoch": 0.2361664149388988, + "grad_norm": 1.1685765981674194, + "learning_rate": 0.00023626373626373624, + "loss": 0.7906, + "step": 215 + }, + { + "epoch": 0.2372648633804751, + "grad_norm": 0.581599771976471, + "learning_rate": 0.00023736263736263733, + "loss": 0.7049, + "step": 216 + }, + { + "epoch": 0.23836331182205137, + "grad_norm": 0.7660847902297974, + "learning_rate": 0.00023846153846153844, + "loss": 0.6105, + "step": 217 + }, + { + "epoch": 0.23946176026362762, + "grad_norm": 0.5126472115516663, + "learning_rate": 0.00023956043956043955, + "loss": 0.7134, + "step": 218 + }, + { + "epoch": 0.2405602087052039, + "grad_norm": 0.48460498452186584, + "learning_rate": 0.00024065934065934066, + "loss": 0.5578, + "step": 219 + }, + { + "epoch": 0.24165865714678017, + "grad_norm": 0.41463029384613037, + "learning_rate": 0.00024175824175824175, + "loss": 0.5589, + "step": 220 + }, + { + "epoch": 0.24275710558835645, + "grad_norm": 2.0703623294830322, + "learning_rate": 0.00024285714285714283, + "loss": 0.7128, + "step": 221 + }, + { + "epoch": 0.24385555402993273, + "grad_norm": 1.5641820430755615, + "learning_rate": 0.00024395604395604394, + "loss": 0.4439, + "step": 222 + }, + { + "epoch": 0.244954002471509, + "grad_norm": 0.34634652733802795, + "learning_rate": 0.00024505494505494503, + "loss": 0.5389, + "step": 223 + }, + { + "epoch": 0.24605245091308528, + "grad_norm": 0.5669183135032654, + "learning_rate": 0.00024615384615384614, + "loss": 0.5699, + "step": 224 + }, + { + "epoch": 0.24715089935466153, + "grad_norm": 0.6459633111953735, + "learning_rate": 0.0002472527472527472, + "loss": 0.7904, + "step": 225 + }, + { + "epoch": 0.2482493477962378, + "grad_norm": 0.9719502925872803, + "learning_rate": 0.0002483516483516483, + "loss": 0.7354, + "step": 226 + }, + { + "epoch": 0.24934779623781408, + "grad_norm": 0.7433357834815979, + "learning_rate": 0.0002494505494505494, + "loss": 0.5772, + "step": 227 + }, + { + "epoch": 0.25044624467939036, + "grad_norm": 0.42272481322288513, + "learning_rate": 0.00025054945054945053, + "loss": 0.5609, + "step": 228 + }, + { + "epoch": 0.2515446931209666, + "grad_norm": 1.2868828773498535, + "learning_rate": 0.00025164835164835165, + "loss": 0.5775, + "step": 229 + }, + { + "epoch": 0.2526431415625429, + "grad_norm": 0.40398430824279785, + "learning_rate": 0.0002527472527472527, + "loss": 0.742, + "step": 230 + }, + { + "epoch": 0.25374159000411917, + "grad_norm": 0.46501678228378296, + "learning_rate": 0.0002538461538461538, + "loss": 0.69, + "step": 231 + }, + { + "epoch": 0.25484003844569547, + "grad_norm": 0.46631869673728943, + "learning_rate": 0.00025494505494505493, + "loss": 0.7712, + "step": 232 + }, + { + "epoch": 0.2559384868872717, + "grad_norm": 0.6761367321014404, + "learning_rate": 0.000256043956043956, + "loss": 0.64, + "step": 233 + }, + { + "epoch": 0.257036935328848, + "grad_norm": 0.6253519654273987, + "learning_rate": 0.0002571428571428571, + "loss": 0.5499, + "step": 234 + }, + { + "epoch": 0.2581353837704243, + "grad_norm": 1.0556268692016602, + "learning_rate": 0.0002582417582417582, + "loss": 0.869, + "step": 235 + }, + { + "epoch": 0.2592338322120005, + "grad_norm": 0.4816044867038727, + "learning_rate": 0.0002593406593406593, + "loss": 0.6061, + "step": 236 + }, + { + "epoch": 0.26033228065357683, + "grad_norm": 1.1049383878707886, + "learning_rate": 0.00026043956043956043, + "loss": 0.7695, + "step": 237 + }, + { + "epoch": 0.2614307290951531, + "grad_norm": 0.44643181562423706, + "learning_rate": 0.00026153846153846154, + "loss": 0.7849, + "step": 238 + }, + { + "epoch": 0.2625291775367294, + "grad_norm": 0.5231640338897705, + "learning_rate": 0.0002626373626373626, + "loss": 0.8033, + "step": 239 + }, + { + "epoch": 0.26362762597830564, + "grad_norm": 0.5537316799163818, + "learning_rate": 0.0002637362637362637, + "loss": 0.7317, + "step": 240 + }, + { + "epoch": 0.26472607441988194, + "grad_norm": 0.42069998383522034, + "learning_rate": 0.0002648351648351648, + "loss": 0.6325, + "step": 241 + }, + { + "epoch": 0.2658245228614582, + "grad_norm": 0.8009732365608215, + "learning_rate": 0.0002659340659340659, + "loss": 0.6589, + "step": 242 + }, + { + "epoch": 0.26692297130303444, + "grad_norm": 1.2626444101333618, + "learning_rate": 0.000267032967032967, + "loss": 0.5845, + "step": 243 + }, + { + "epoch": 0.26802141974461074, + "grad_norm": 0.4783913195133209, + "learning_rate": 0.0002681318681318681, + "loss": 0.8844, + "step": 244 + }, + { + "epoch": 0.269119868186187, + "grad_norm": 1.098160982131958, + "learning_rate": 0.0002692307692307692, + "loss": 0.6134, + "step": 245 + }, + { + "epoch": 0.2702183166277633, + "grad_norm": 1.0397273302078247, + "learning_rate": 0.00027032967032967033, + "loss": 0.7861, + "step": 246 + }, + { + "epoch": 0.27131676506933955, + "grad_norm": 0.9729229807853699, + "learning_rate": 0.0002714285714285714, + "loss": 0.7691, + "step": 247 + }, + { + "epoch": 0.27241521351091585, + "grad_norm": 0.44837963581085205, + "learning_rate": 0.0002725274725274725, + "loss": 0.9414, + "step": 248 + }, + { + "epoch": 0.2735136619524921, + "grad_norm": 1.4863499402999878, + "learning_rate": 0.0002736263736263736, + "loss": 0.5825, + "step": 249 + }, + { + "epoch": 0.27461211039406835, + "grad_norm": 0.5948237180709839, + "learning_rate": 0.00027472527472527467, + "loss": 0.4934, + "step": 250 + }, + { + "epoch": 0.27571055883564466, + "grad_norm": 0.5448721051216125, + "learning_rate": 0.0002758241758241758, + "loss": 0.6295, + "step": 251 + }, + { + "epoch": 0.2768090072772209, + "grad_norm": 0.4309394657611847, + "learning_rate": 0.0002769230769230769, + "loss": 0.6561, + "step": 252 + }, + { + "epoch": 0.2779074557187972, + "grad_norm": 0.7659335136413574, + "learning_rate": 0.000278021978021978, + "loss": 0.7588, + "step": 253 + }, + { + "epoch": 0.27900590416037346, + "grad_norm": 0.45655715465545654, + "learning_rate": 0.0002791208791208791, + "loss": 0.5257, + "step": 254 + }, + { + "epoch": 0.28010435260194977, + "grad_norm": 0.5390630960464478, + "learning_rate": 0.0002802197802197802, + "loss": 0.7051, + "step": 255 + }, + { + "epoch": 0.281202801043526, + "grad_norm": 0.39703306555747986, + "learning_rate": 0.0002813186813186813, + "loss": 0.6137, + "step": 256 + }, + { + "epoch": 0.28230124948510227, + "grad_norm": 0.4662924110889435, + "learning_rate": 0.0002824175824175824, + "loss": 0.4897, + "step": 257 + }, + { + "epoch": 0.2833996979266786, + "grad_norm": 0.39399877190589905, + "learning_rate": 0.00028351648351648346, + "loss": 0.6235, + "step": 258 + }, + { + "epoch": 0.2844981463682548, + "grad_norm": 0.497549444437027, + "learning_rate": 0.00028461538461538457, + "loss": 0.5134, + "step": 259 + }, + { + "epoch": 0.28559659480983113, + "grad_norm": 0.6597803235054016, + "learning_rate": 0.0002857142857142857, + "loss": 0.7955, + "step": 260 + }, + { + "epoch": 0.2866950432514074, + "grad_norm": 0.5545711517333984, + "learning_rate": 0.0002868131868131868, + "loss": 0.833, + "step": 261 + }, + { + "epoch": 0.2877934916929837, + "grad_norm": 1.0227786302566528, + "learning_rate": 0.0002879120879120879, + "loss": 0.5249, + "step": 262 + }, + { + "epoch": 0.28889194013455993, + "grad_norm": 0.5727143883705139, + "learning_rate": 0.000289010989010989, + "loss": 0.6319, + "step": 263 + }, + { + "epoch": 0.2899903885761362, + "grad_norm": 0.39322397112846375, + "learning_rate": 0.0002901098901098901, + "loss": 0.7003, + "step": 264 + }, + { + "epoch": 0.2910888370177125, + "grad_norm": 0.5657737851142883, + "learning_rate": 0.0002912087912087912, + "loss": 0.7085, + "step": 265 + }, + { + "epoch": 0.29218728545928874, + "grad_norm": 0.4305976927280426, + "learning_rate": 0.0002923076923076923, + "loss": 0.5931, + "step": 266 + }, + { + "epoch": 0.29328573390086504, + "grad_norm": 0.5300284624099731, + "learning_rate": 0.00029340659340659336, + "loss": 0.7881, + "step": 267 + }, + { + "epoch": 0.2943841823424413, + "grad_norm": 0.5922349095344543, + "learning_rate": 0.00029450549450549447, + "loss": 0.8688, + "step": 268 + }, + { + "epoch": 0.2954826307840176, + "grad_norm": 0.5700828433036804, + "learning_rate": 0.0002956043956043956, + "loss": 1.1328, + "step": 269 + }, + { + "epoch": 0.29658107922559385, + "grad_norm": 0.6773694753646851, + "learning_rate": 0.0002967032967032967, + "loss": 0.7821, + "step": 270 + }, + { + "epoch": 0.2976795276671701, + "grad_norm": 0.5200739502906799, + "learning_rate": 0.0002978021978021978, + "loss": 0.8775, + "step": 271 + }, + { + "epoch": 0.2987779761087464, + "grad_norm": 0.9860020875930786, + "learning_rate": 0.00029890109890109886, + "loss": 0.9141, + "step": 272 + }, + { + "epoch": 0.29987642455032265, + "grad_norm": 0.7012956142425537, + "learning_rate": 0.0003, + "loss": 0.7672, + "step": 273 + }, + { + "epoch": 0.30097487299189896, + "grad_norm": 0.4128098785877228, + "learning_rate": 0.0002998778998778999, + "loss": 0.3969, + "step": 274 + }, + { + "epoch": 0.3020733214334752, + "grad_norm": 0.366597980260849, + "learning_rate": 0.00029975579975579974, + "loss": 0.639, + "step": 275 + }, + { + "epoch": 0.3031717698750515, + "grad_norm": 0.5208033919334412, + "learning_rate": 0.0002996336996336996, + "loss": 0.664, + "step": 276 + }, + { + "epoch": 0.30427021831662776, + "grad_norm": 0.45519202947616577, + "learning_rate": 0.0002995115995115995, + "loss": 0.8495, + "step": 277 + }, + { + "epoch": 0.305368666758204, + "grad_norm": 0.6617010831832886, + "learning_rate": 0.0002993894993894994, + "loss": 1.0204, + "step": 278 + }, + { + "epoch": 0.3064671151997803, + "grad_norm": 1.4151723384857178, + "learning_rate": 0.00029926739926739923, + "loss": 0.8289, + "step": 279 + }, + { + "epoch": 0.30756556364135657, + "grad_norm": 0.6531035900115967, + "learning_rate": 0.00029914529914529915, + "loss": 0.7571, + "step": 280 + }, + { + "epoch": 0.30866401208293287, + "grad_norm": 0.8595600724220276, + "learning_rate": 0.000299023199023199, + "loss": 0.9668, + "step": 281 + }, + { + "epoch": 0.3097624605245091, + "grad_norm": 0.50210040807724, + "learning_rate": 0.00029890109890109886, + "loss": 0.6662, + "step": 282 + }, + { + "epoch": 0.3108609089660854, + "grad_norm": 0.6004669666290283, + "learning_rate": 0.0002987789987789988, + "loss": 0.7127, + "step": 283 + }, + { + "epoch": 0.3119593574076617, + "grad_norm": 0.8085057139396667, + "learning_rate": 0.00029865689865689863, + "loss": 0.9266, + "step": 284 + }, + { + "epoch": 0.3130578058492379, + "grad_norm": 0.44965627789497375, + "learning_rate": 0.0002985347985347985, + "loss": 0.7118, + "step": 285 + }, + { + "epoch": 0.31415625429081423, + "grad_norm": 0.5758265852928162, + "learning_rate": 0.00029841269841269835, + "loss": 0.6915, + "step": 286 + }, + { + "epoch": 0.3152547027323905, + "grad_norm": 0.5623393058776855, + "learning_rate": 0.00029829059829059826, + "loss": 0.6962, + "step": 287 + }, + { + "epoch": 0.3163531511739668, + "grad_norm": 0.857796311378479, + "learning_rate": 0.0002981684981684982, + "loss": 0.676, + "step": 288 + }, + { + "epoch": 0.31745159961554303, + "grad_norm": 0.36431241035461426, + "learning_rate": 0.000298046398046398, + "loss": 0.5475, + "step": 289 + }, + { + "epoch": 0.31855004805711934, + "grad_norm": 0.4778802692890167, + "learning_rate": 0.0002979242979242979, + "loss": 0.7198, + "step": 290 + }, + { + "epoch": 0.3196484964986956, + "grad_norm": 0.4887610673904419, + "learning_rate": 0.0002978021978021978, + "loss": 0.5559, + "step": 291 + }, + { + "epoch": 0.32074694494027184, + "grad_norm": 0.745379626750946, + "learning_rate": 0.00029768009768009766, + "loss": 1.0509, + "step": 292 + }, + { + "epoch": 0.32184539338184814, + "grad_norm": 0.40081167221069336, + "learning_rate": 0.0002975579975579975, + "loss": 0.6564, + "step": 293 + }, + { + "epoch": 0.3229438418234244, + "grad_norm": 0.5133034586906433, + "learning_rate": 0.00029743589743589743, + "loss": 0.6765, + "step": 294 + }, + { + "epoch": 0.3240422902650007, + "grad_norm": 0.5123881697654724, + "learning_rate": 0.0002973137973137973, + "loss": 0.8001, + "step": 295 + }, + { + "epoch": 0.32514073870657695, + "grad_norm": 0.3771597743034363, + "learning_rate": 0.00029719169719169715, + "loss": 0.785, + "step": 296 + }, + { + "epoch": 0.32623918714815325, + "grad_norm": 0.38929086923599243, + "learning_rate": 0.00029706959706959706, + "loss": 0.7273, + "step": 297 + }, + { + "epoch": 0.3273376355897295, + "grad_norm": 0.47761446237564087, + "learning_rate": 0.0002969474969474969, + "loss": 0.6997, + "step": 298 + }, + { + "epoch": 0.3284360840313058, + "grad_norm": 0.4798452854156494, + "learning_rate": 0.0002968253968253968, + "loss": 0.7171, + "step": 299 + }, + { + "epoch": 0.32953453247288206, + "grad_norm": 0.5864073038101196, + "learning_rate": 0.0002967032967032967, + "loss": 0.7075, + "step": 300 + }, + { + "epoch": 0.3306329809144583, + "grad_norm": 0.6298258900642395, + "learning_rate": 0.00029658119658119655, + "loss": 0.8659, + "step": 301 + }, + { + "epoch": 0.3317314293560346, + "grad_norm": 0.9764651656150818, + "learning_rate": 0.0002964590964590964, + "loss": 0.7451, + "step": 302 + }, + { + "epoch": 0.33282987779761086, + "grad_norm": 0.7084535360336304, + "learning_rate": 0.0002963369963369963, + "loss": 0.7896, + "step": 303 + }, + { + "epoch": 0.33392832623918717, + "grad_norm": 0.3226016163825989, + "learning_rate": 0.0002962148962148962, + "loss": 0.5614, + "step": 304 + }, + { + "epoch": 0.3350267746807634, + "grad_norm": 0.5515668988227844, + "learning_rate": 0.0002960927960927961, + "loss": 0.6981, + "step": 305 + }, + { + "epoch": 0.3361252231223397, + "grad_norm": 0.42776307463645935, + "learning_rate": 0.00029597069597069595, + "loss": 0.5911, + "step": 306 + }, + { + "epoch": 0.33722367156391597, + "grad_norm": 0.36645814776420593, + "learning_rate": 0.0002958485958485958, + "loss": 0.5584, + "step": 307 + }, + { + "epoch": 0.3383221200054922, + "grad_norm": 0.4089672565460205, + "learning_rate": 0.0002957264957264957, + "loss": 0.6814, + "step": 308 + }, + { + "epoch": 0.3394205684470685, + "grad_norm": 0.4406324326992035, + "learning_rate": 0.0002956043956043956, + "loss": 0.5426, + "step": 309 + }, + { + "epoch": 0.3405190168886448, + "grad_norm": 0.4138193726539612, + "learning_rate": 0.00029548229548229544, + "loss": 0.7554, + "step": 310 + }, + { + "epoch": 0.3416174653302211, + "grad_norm": 0.45647338032722473, + "learning_rate": 0.00029536019536019535, + "loss": 0.4871, + "step": 311 + }, + { + "epoch": 0.34271591377179733, + "grad_norm": 0.44362974166870117, + "learning_rate": 0.0002952380952380952, + "loss": 0.7254, + "step": 312 + }, + { + "epoch": 0.34381436221337364, + "grad_norm": 0.5832559466362, + "learning_rate": 0.00029511599511599507, + "loss": 0.64, + "step": 313 + }, + { + "epoch": 0.3449128106549499, + "grad_norm": 0.6754651665687561, + "learning_rate": 0.000294993894993895, + "loss": 0.7046, + "step": 314 + }, + { + "epoch": 0.34601125909652614, + "grad_norm": 0.6487123370170593, + "learning_rate": 0.00029487179487179484, + "loss": 0.5934, + "step": 315 + }, + { + "epoch": 0.34710970753810244, + "grad_norm": 0.24118930101394653, + "learning_rate": 0.0002947496947496947, + "loss": 0.5241, + "step": 316 + }, + { + "epoch": 0.3482081559796787, + "grad_norm": 0.4580494165420532, + "learning_rate": 0.0002946275946275946, + "loss": 0.6733, + "step": 317 + }, + { + "epoch": 0.349306604421255, + "grad_norm": 0.4770609736442566, + "learning_rate": 0.00029450549450549447, + "loss": 0.5758, + "step": 318 + }, + { + "epoch": 0.35040505286283125, + "grad_norm": 0.40334221720695496, + "learning_rate": 0.0002943833943833944, + "loss": 0.5365, + "step": 319 + }, + { + "epoch": 0.35150350130440755, + "grad_norm": 0.5605480074882507, + "learning_rate": 0.00029426129426129424, + "loss": 0.5967, + "step": 320 + }, + { + "epoch": 0.3526019497459838, + "grad_norm": 0.6031836271286011, + "learning_rate": 0.0002941391941391941, + "loss": 0.6397, + "step": 321 + }, + { + "epoch": 0.35370039818756005, + "grad_norm": 0.5602075457572937, + "learning_rate": 0.000294017094017094, + "loss": 0.7253, + "step": 322 + }, + { + "epoch": 0.35479884662913636, + "grad_norm": 1.5055879354476929, + "learning_rate": 0.00029389499389499387, + "loss": 0.6066, + "step": 323 + }, + { + "epoch": 0.3558972950707126, + "grad_norm": 1.969072699546814, + "learning_rate": 0.0002937728937728937, + "loss": 0.9263, + "step": 324 + }, + { + "epoch": 0.3569957435122889, + "grad_norm": 0.43139147758483887, + "learning_rate": 0.00029365079365079364, + "loss": 0.6462, + "step": 325 + }, + { + "epoch": 0.35809419195386516, + "grad_norm": 0.40423595905303955, + "learning_rate": 0.0002935286935286935, + "loss": 0.4278, + "step": 326 + }, + { + "epoch": 0.35919264039544146, + "grad_norm": 0.41983166337013245, + "learning_rate": 0.00029340659340659336, + "loss": 0.7527, + "step": 327 + }, + { + "epoch": 0.3602910888370177, + "grad_norm": 0.6624807715415955, + "learning_rate": 0.00029328449328449327, + "loss": 0.7381, + "step": 328 + }, + { + "epoch": 0.36138953727859396, + "grad_norm": 0.6173990964889526, + "learning_rate": 0.00029316239316239313, + "loss": 0.6838, + "step": 329 + }, + { + "epoch": 0.36248798572017027, + "grad_norm": 1.1278433799743652, + "learning_rate": 0.000293040293040293, + "loss": 0.8439, + "step": 330 + }, + { + "epoch": 0.3635864341617465, + "grad_norm": 0.3453993797302246, + "learning_rate": 0.0002929181929181929, + "loss": 0.5324, + "step": 331 + }, + { + "epoch": 0.3646848826033228, + "grad_norm": 0.4151187241077423, + "learning_rate": 0.0002927960927960928, + "loss": 0.7019, + "step": 332 + }, + { + "epoch": 0.3657833310448991, + "grad_norm": 0.4247313439846039, + "learning_rate": 0.0002926739926739926, + "loss": 0.6362, + "step": 333 + }, + { + "epoch": 0.3668817794864754, + "grad_norm": 1.5250136852264404, + "learning_rate": 0.00029255189255189253, + "loss": 0.5885, + "step": 334 + }, + { + "epoch": 0.36798022792805163, + "grad_norm": 0.43669968843460083, + "learning_rate": 0.00029242979242979244, + "loss": 0.9191, + "step": 335 + }, + { + "epoch": 0.3690786763696279, + "grad_norm": 0.8063925504684448, + "learning_rate": 0.0002923076923076923, + "loss": 0.6813, + "step": 336 + }, + { + "epoch": 0.3701771248112042, + "grad_norm": 0.6002399325370789, + "learning_rate": 0.00029218559218559216, + "loss": 0.5859, + "step": 337 + }, + { + "epoch": 0.37127557325278043, + "grad_norm": 0.9405462145805359, + "learning_rate": 0.000292063492063492, + "loss": 0.7476, + "step": 338 + }, + { + "epoch": 0.37237402169435674, + "grad_norm": 0.5050615072250366, + "learning_rate": 0.00029194139194139193, + "loss": 0.5172, + "step": 339 + }, + { + "epoch": 0.373472470135933, + "grad_norm": 0.4593801200389862, + "learning_rate": 0.0002918192918192918, + "loss": 0.5405, + "step": 340 + }, + { + "epoch": 0.3745709185775093, + "grad_norm": 0.5275060534477234, + "learning_rate": 0.00029169719169719164, + "loss": 0.4537, + "step": 341 + }, + { + "epoch": 0.37566936701908554, + "grad_norm": 0.8907522559165955, + "learning_rate": 0.00029157509157509156, + "loss": 0.6826, + "step": 342 + }, + { + "epoch": 0.3767678154606618, + "grad_norm": 0.7229670882225037, + "learning_rate": 0.0002914529914529914, + "loss": 0.6072, + "step": 343 + }, + { + "epoch": 0.3778662639022381, + "grad_norm": 1.7154827117919922, + "learning_rate": 0.0002913308913308913, + "loss": 0.6956, + "step": 344 + }, + { + "epoch": 0.37896471234381435, + "grad_norm": 1.012902021408081, + "learning_rate": 0.0002912087912087912, + "loss": 0.5337, + "step": 345 + }, + { + "epoch": 0.38006316078539065, + "grad_norm": 0.6467313170433044, + "learning_rate": 0.00029108669108669105, + "loss": 0.7652, + "step": 346 + }, + { + "epoch": 0.3811616092269669, + "grad_norm": 0.5594947338104248, + "learning_rate": 0.0002909645909645909, + "loss": 0.578, + "step": 347 + }, + { + "epoch": 0.3822600576685432, + "grad_norm": 0.5808854699134827, + "learning_rate": 0.0002908424908424908, + "loss": 0.6142, + "step": 348 + }, + { + "epoch": 0.38335850611011946, + "grad_norm": 0.6067795157432556, + "learning_rate": 0.00029072039072039073, + "loss": 0.7682, + "step": 349 + }, + { + "epoch": 0.3844569545516957, + "grad_norm": 0.392993301153183, + "learning_rate": 0.0002905982905982906, + "loss": 0.6599, + "step": 350 + }, + { + "epoch": 0.385555402993272, + "grad_norm": 0.3963404893875122, + "learning_rate": 0.00029047619047619045, + "loss": 0.7079, + "step": 351 + }, + { + "epoch": 0.38665385143484826, + "grad_norm": 0.3471222221851349, + "learning_rate": 0.00029035409035409036, + "loss": 0.463, + "step": 352 + }, + { + "epoch": 0.38775229987642457, + "grad_norm": 0.5496531128883362, + "learning_rate": 0.0002902319902319902, + "loss": 0.7639, + "step": 353 + }, + { + "epoch": 0.3888507483180008, + "grad_norm": 0.5482885241508484, + "learning_rate": 0.0002901098901098901, + "loss": 0.4198, + "step": 354 + }, + { + "epoch": 0.3899491967595771, + "grad_norm": 0.7329181432723999, + "learning_rate": 0.00028998778998779, + "loss": 0.6057, + "step": 355 + }, + { + "epoch": 0.39104764520115337, + "grad_norm": 0.41850918531417847, + "learning_rate": 0.00028986568986568985, + "loss": 0.605, + "step": 356 + }, + { + "epoch": 0.3921460936427296, + "grad_norm": 0.4463609457015991, + "learning_rate": 0.0002897435897435897, + "loss": 0.7381, + "step": 357 + }, + { + "epoch": 0.3932445420843059, + "grad_norm": 0.7207491397857666, + "learning_rate": 0.0002896214896214896, + "loss": 0.6892, + "step": 358 + }, + { + "epoch": 0.3943429905258822, + "grad_norm": 0.3715958595275879, + "learning_rate": 0.0002894993894993895, + "loss": 0.5426, + "step": 359 + }, + { + "epoch": 0.3954414389674585, + "grad_norm": 0.7077822685241699, + "learning_rate": 0.00028937728937728933, + "loss": 0.5923, + "step": 360 + }, + { + "epoch": 0.39653988740903473, + "grad_norm": 0.5109585523605347, + "learning_rate": 0.00028925518925518925, + "loss": 0.5939, + "step": 361 + }, + { + "epoch": 0.39763833585061104, + "grad_norm": 0.6105355024337769, + "learning_rate": 0.0002891330891330891, + "loss": 1.0345, + "step": 362 + }, + { + "epoch": 0.3987367842921873, + "grad_norm": 0.479732871055603, + "learning_rate": 0.000289010989010989, + "loss": 0.71, + "step": 363 + }, + { + "epoch": 0.39983523273376353, + "grad_norm": 0.8600007891654968, + "learning_rate": 0.0002888888888888888, + "loss": 0.7406, + "step": 364 + }, + { + "epoch": 0.40093368117533984, + "grad_norm": 0.6584550738334656, + "learning_rate": 0.00028876678876678873, + "loss": 0.6658, + "step": 365 + }, + { + "epoch": 0.4020321296169161, + "grad_norm": 0.7251041531562805, + "learning_rate": 0.00028864468864468865, + "loss": 0.8425, + "step": 366 + }, + { + "epoch": 0.4031305780584924, + "grad_norm": 0.5729238390922546, + "learning_rate": 0.0002885225885225885, + "loss": 0.9054, + "step": 367 + }, + { + "epoch": 0.40422902650006864, + "grad_norm": 1.1829932928085327, + "learning_rate": 0.00028840048840048836, + "loss": 0.9232, + "step": 368 + }, + { + "epoch": 0.40532747494164495, + "grad_norm": 0.37746721506118774, + "learning_rate": 0.0002882783882783883, + "loss": 0.9619, + "step": 369 + }, + { + "epoch": 0.4064259233832212, + "grad_norm": 0.5653749108314514, + "learning_rate": 0.00028815628815628813, + "loss": 0.7182, + "step": 370 + }, + { + "epoch": 0.40752437182479745, + "grad_norm": 0.6024563312530518, + "learning_rate": 0.000288034188034188, + "loss": 0.6881, + "step": 371 + }, + { + "epoch": 0.40862282026637375, + "grad_norm": 0.485350102186203, + "learning_rate": 0.0002879120879120879, + "loss": 0.6451, + "step": 372 + }, + { + "epoch": 0.40972126870795, + "grad_norm": 0.5762611627578735, + "learning_rate": 0.00028778998778998776, + "loss": 0.7818, + "step": 373 + }, + { + "epoch": 0.4108197171495263, + "grad_norm": 0.7961844801902771, + "learning_rate": 0.0002876678876678876, + "loss": 0.6682, + "step": 374 + }, + { + "epoch": 0.41191816559110256, + "grad_norm": 0.4630587697029114, + "learning_rate": 0.00028754578754578753, + "loss": 0.9015, + "step": 375 + }, + { + "epoch": 0.41301661403267886, + "grad_norm": 0.6592808961868286, + "learning_rate": 0.0002874236874236874, + "loss": 0.5738, + "step": 376 + }, + { + "epoch": 0.4141150624742551, + "grad_norm": 0.4788278639316559, + "learning_rate": 0.00028730158730158725, + "loss": 0.7022, + "step": 377 + }, + { + "epoch": 0.41521351091583136, + "grad_norm": 0.5041861534118652, + "learning_rate": 0.00028717948717948716, + "loss": 0.6137, + "step": 378 + }, + { + "epoch": 0.41631195935740767, + "grad_norm": 0.5436013340950012, + "learning_rate": 0.000287057387057387, + "loss": 0.6621, + "step": 379 + }, + { + "epoch": 0.4174104077989839, + "grad_norm": 0.5102400183677673, + "learning_rate": 0.00028693528693528694, + "loss": 0.6627, + "step": 380 + }, + { + "epoch": 0.4185088562405602, + "grad_norm": 0.43655040860176086, + "learning_rate": 0.0002868131868131868, + "loss": 0.6475, + "step": 381 + }, + { + "epoch": 0.4196073046821365, + "grad_norm": 0.3989826738834381, + "learning_rate": 0.00028669108669108665, + "loss": 0.5483, + "step": 382 + }, + { + "epoch": 0.4207057531237128, + "grad_norm": 0.7781158685684204, + "learning_rate": 0.00028656898656898656, + "loss": 0.6475, + "step": 383 + }, + { + "epoch": 0.421804201565289, + "grad_norm": 0.8119930624961853, + "learning_rate": 0.0002864468864468864, + "loss": 0.8122, + "step": 384 + }, + { + "epoch": 0.4229026500068653, + "grad_norm": 0.7233585119247437, + "learning_rate": 0.0002863247863247863, + "loss": 0.7837, + "step": 385 + }, + { + "epoch": 0.4240010984484416, + "grad_norm": 0.41249507665634155, + "learning_rate": 0.0002862026862026862, + "loss": 0.6916, + "step": 386 + }, + { + "epoch": 0.42509954689001783, + "grad_norm": 0.4865298867225647, + "learning_rate": 0.00028608058608058605, + "loss": 0.595, + "step": 387 + }, + { + "epoch": 0.42619799533159414, + "grad_norm": 0.6057963371276855, + "learning_rate": 0.0002859584859584859, + "loss": 0.7214, + "step": 388 + }, + { + "epoch": 0.4272964437731704, + "grad_norm": 0.5390968918800354, + "learning_rate": 0.0002858363858363858, + "loss": 0.805, + "step": 389 + }, + { + "epoch": 0.4283948922147467, + "grad_norm": 0.5944109559059143, + "learning_rate": 0.0002857142857142857, + "loss": 0.9953, + "step": 390 + }, + { + "epoch": 0.42949334065632294, + "grad_norm": 0.5480278134346008, + "learning_rate": 0.00028559218559218554, + "loss": 0.8406, + "step": 391 + }, + { + "epoch": 0.4305917890978992, + "grad_norm": 0.5168552994728088, + "learning_rate": 0.00028547008547008545, + "loss": 0.9715, + "step": 392 + }, + { + "epoch": 0.4316902375394755, + "grad_norm": 0.4859452247619629, + "learning_rate": 0.0002853479853479853, + "loss": 0.7368, + "step": 393 + }, + { + "epoch": 0.43278868598105175, + "grad_norm": 0.4697234034538269, + "learning_rate": 0.0002852258852258852, + "loss": 0.4801, + "step": 394 + }, + { + "epoch": 0.43388713442262805, + "grad_norm": 0.6198891401290894, + "learning_rate": 0.0002851037851037851, + "loss": 0.5184, + "step": 395 + }, + { + "epoch": 0.4349855828642043, + "grad_norm": 0.531563401222229, + "learning_rate": 0.00028498168498168494, + "loss": 0.8047, + "step": 396 + }, + { + "epoch": 0.4360840313057806, + "grad_norm": 0.4610724449157715, + "learning_rate": 0.00028485958485958485, + "loss": 0.4583, + "step": 397 + }, + { + "epoch": 0.43718247974735686, + "grad_norm": 0.5609697699546814, + "learning_rate": 0.0002847374847374847, + "loss": 0.7362, + "step": 398 + }, + { + "epoch": 0.4382809281889331, + "grad_norm": 0.5257968306541443, + "learning_rate": 0.00028461538461538457, + "loss": 0.8173, + "step": 399 + }, + { + "epoch": 0.4393793766305094, + "grad_norm": 0.8307009339332581, + "learning_rate": 0.0002844932844932845, + "loss": 0.5507, + "step": 400 + }, + { + "epoch": 0.44047782507208566, + "grad_norm": 0.36615508794784546, + "learning_rate": 0.00028437118437118434, + "loss": 0.6605, + "step": 401 + }, + { + "epoch": 0.44157627351366197, + "grad_norm": 0.35138362646102905, + "learning_rate": 0.0002842490842490842, + "loss": 0.6614, + "step": 402 + }, + { + "epoch": 0.4426747219552382, + "grad_norm": 0.5054494738578796, + "learning_rate": 0.0002841269841269841, + "loss": 0.799, + "step": 403 + }, + { + "epoch": 0.4437731703968145, + "grad_norm": 0.4711816608905792, + "learning_rate": 0.00028400488400488397, + "loss": 0.8892, + "step": 404 + }, + { + "epoch": 0.44487161883839077, + "grad_norm": 0.5073884725570679, + "learning_rate": 0.00028388278388278383, + "loss": 0.8156, + "step": 405 + }, + { + "epoch": 0.445970067279967, + "grad_norm": 0.29938632249832153, + "learning_rate": 0.00028376068376068374, + "loss": 0.7598, + "step": 406 + }, + { + "epoch": 0.4470685157215433, + "grad_norm": 1.745937466621399, + "learning_rate": 0.00028363858363858365, + "loss": 0.7829, + "step": 407 + }, + { + "epoch": 0.4481669641631196, + "grad_norm": 0.46887943148612976, + "learning_rate": 0.00028351648351648346, + "loss": 0.7798, + "step": 408 + }, + { + "epoch": 0.4492654126046959, + "grad_norm": 0.4274987280368805, + "learning_rate": 0.00028339438339438337, + "loss": 0.8407, + "step": 409 + }, + { + "epoch": 0.45036386104627213, + "grad_norm": 0.4445902109146118, + "learning_rate": 0.0002832722832722833, + "loss": 0.7394, + "step": 410 + }, + { + "epoch": 0.45146230948784843, + "grad_norm": 0.3842466175556183, + "learning_rate": 0.00028315018315018314, + "loss": 0.7781, + "step": 411 + }, + { + "epoch": 0.4525607579294247, + "grad_norm": 0.5660600066184998, + "learning_rate": 0.000283028083028083, + "loss": 0.8058, + "step": 412 + }, + { + "epoch": 0.45365920637100093, + "grad_norm": 0.442911297082901, + "learning_rate": 0.0002829059829059829, + "loss": 0.808, + "step": 413 + }, + { + "epoch": 0.45475765481257724, + "grad_norm": 0.9051260352134705, + "learning_rate": 0.00028278388278388277, + "loss": 0.9427, + "step": 414 + }, + { + "epoch": 0.4558561032541535, + "grad_norm": 0.8027593493461609, + "learning_rate": 0.00028266178266178263, + "loss": 0.531, + "step": 415 + }, + { + "epoch": 0.4569545516957298, + "grad_norm": 0.36242446303367615, + "learning_rate": 0.0002825396825396825, + "loss": 0.5609, + "step": 416 + }, + { + "epoch": 0.45805300013730604, + "grad_norm": 0.6095871925354004, + "learning_rate": 0.0002824175824175824, + "loss": 0.7424, + "step": 417 + }, + { + "epoch": 0.45915144857888235, + "grad_norm": 0.5102814435958862, + "learning_rate": 0.00028229548229548226, + "loss": 0.8861, + "step": 418 + }, + { + "epoch": 0.4602498970204586, + "grad_norm": 0.375265896320343, + "learning_rate": 0.0002821733821733821, + "loss": 0.6235, + "step": 419 + }, + { + "epoch": 0.4613483454620349, + "grad_norm": 0.4506315588951111, + "learning_rate": 0.00028205128205128203, + "loss": 0.6059, + "step": 420 + }, + { + "epoch": 0.46244679390361115, + "grad_norm": 0.8119642734527588, + "learning_rate": 0.0002819291819291819, + "loss": 0.7821, + "step": 421 + }, + { + "epoch": 0.4635452423451874, + "grad_norm": 0.42945513129234314, + "learning_rate": 0.00028180708180708175, + "loss": 0.9503, + "step": 422 + }, + { + "epoch": 0.4646436907867637, + "grad_norm": 0.35567665100097656, + "learning_rate": 0.00028168498168498166, + "loss": 0.5243, + "step": 423 + }, + { + "epoch": 0.46574213922833996, + "grad_norm": 0.5160343647003174, + "learning_rate": 0.00028156288156288157, + "loss": 0.5767, + "step": 424 + }, + { + "epoch": 0.46684058766991626, + "grad_norm": 0.37530624866485596, + "learning_rate": 0.00028144078144078143, + "loss": 1.2016, + "step": 425 + }, + { + "epoch": 0.4679390361114925, + "grad_norm": 0.5283146500587463, + "learning_rate": 0.0002813186813186813, + "loss": 0.5958, + "step": 426 + }, + { + "epoch": 0.4690374845530688, + "grad_norm": 0.5217192769050598, + "learning_rate": 0.0002811965811965812, + "loss": 0.715, + "step": 427 + }, + { + "epoch": 0.47013593299464507, + "grad_norm": 0.5092077851295471, + "learning_rate": 0.00028107448107448106, + "loss": 0.6942, + "step": 428 + }, + { + "epoch": 0.4712343814362213, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002809523809523809, + "loss": 1.0185, + "step": 429 + }, + { + "epoch": 0.4723328298777976, + "grad_norm": 0.3117397725582123, + "learning_rate": 0.00028083028083028083, + "loss": 0.6949, + "step": 430 + }, + { + "epoch": 0.47343127831937387, + "grad_norm": 0.3218965232372284, + "learning_rate": 0.0002807081807081807, + "loss": 0.6872, + "step": 431 + }, + { + "epoch": 0.4745297267609502, + "grad_norm": 1.104121446609497, + "learning_rate": 0.00028058608058608055, + "loss": 0.6628, + "step": 432 + }, + { + "epoch": 0.4756281752025264, + "grad_norm": 0.3224816620349884, + "learning_rate": 0.00028046398046398046, + "loss": 0.5974, + "step": 433 + }, + { + "epoch": 0.47672662364410273, + "grad_norm": 0.5742220878601074, + "learning_rate": 0.0002803418803418803, + "loss": 0.7248, + "step": 434 + }, + { + "epoch": 0.477825072085679, + "grad_norm": 0.5449275374412537, + "learning_rate": 0.0002802197802197802, + "loss": 0.8552, + "step": 435 + }, + { + "epoch": 0.47892352052725523, + "grad_norm": 0.44660067558288574, + "learning_rate": 0.0002800976800976801, + "loss": 0.6968, + "step": 436 + }, + { + "epoch": 0.48002196896883154, + "grad_norm": 0.4287508428096771, + "learning_rate": 0.00027997557997557995, + "loss": 0.8101, + "step": 437 + }, + { + "epoch": 0.4811204174104078, + "grad_norm": 0.4142225384712219, + "learning_rate": 0.00027985347985347986, + "loss": 0.5379, + "step": 438 + }, + { + "epoch": 0.4822188658519841, + "grad_norm": 1.246833324432373, + "learning_rate": 0.0002797313797313797, + "loss": 0.7116, + "step": 439 + }, + { + "epoch": 0.48331731429356034, + "grad_norm": 0.3845030963420868, + "learning_rate": 0.0002796092796092796, + "loss": 0.8088, + "step": 440 + }, + { + "epoch": 0.48441576273513665, + "grad_norm": 1.4492995738983154, + "learning_rate": 0.0002794871794871795, + "loss": 0.7358, + "step": 441 + }, + { + "epoch": 0.4855142111767129, + "grad_norm": 0.40994521975517273, + "learning_rate": 0.00027936507936507935, + "loss": 0.6228, + "step": 442 + }, + { + "epoch": 0.48661265961828915, + "grad_norm": 0.4782777428627014, + "learning_rate": 0.0002792429792429792, + "loss": 0.4944, + "step": 443 + }, + { + "epoch": 0.48771110805986545, + "grad_norm": 0.47269922494888306, + "learning_rate": 0.0002791208791208791, + "loss": 0.7023, + "step": 444 + }, + { + "epoch": 0.4888095565014417, + "grad_norm": 0.5529118776321411, + "learning_rate": 0.000278998778998779, + "loss": 0.7717, + "step": 445 + }, + { + "epoch": 0.489908004943018, + "grad_norm": 0.4244072139263153, + "learning_rate": 0.00027887667887667884, + "loss": 0.7902, + "step": 446 + }, + { + "epoch": 0.49100645338459425, + "grad_norm": 1.4737539291381836, + "learning_rate": 0.00027875457875457875, + "loss": 0.5784, + "step": 447 + }, + { + "epoch": 0.49210490182617056, + "grad_norm": 0.40120208263397217, + "learning_rate": 0.0002786324786324786, + "loss": 0.7974, + "step": 448 + }, + { + "epoch": 0.4932033502677468, + "grad_norm": 0.5481031537055969, + "learning_rate": 0.00027851037851037846, + "loss": 0.7867, + "step": 449 + }, + { + "epoch": 0.49430179870932306, + "grad_norm": 0.36719343066215515, + "learning_rate": 0.0002783882783882784, + "loss": 0.6543, + "step": 450 + }, + { + "epoch": 0.49540024715089936, + "grad_norm": 0.3980066776275635, + "learning_rate": 0.00027826617826617824, + "loss": 0.5395, + "step": 451 + }, + { + "epoch": 0.4964986955924756, + "grad_norm": 0.45570313930511475, + "learning_rate": 0.0002781440781440781, + "loss": 0.7908, + "step": 452 + }, + { + "epoch": 0.4975971440340519, + "grad_norm": 0.41858601570129395, + "learning_rate": 0.000278021978021978, + "loss": 0.5248, + "step": 453 + }, + { + "epoch": 0.49869559247562817, + "grad_norm": 0.5019702315330505, + "learning_rate": 0.00027789987789987786, + "loss": 0.8006, + "step": 454 + }, + { + "epoch": 0.4997940409172045, + "grad_norm": 0.4589880108833313, + "learning_rate": 0.0002777777777777778, + "loss": 0.7294, + "step": 455 + }, + { + "epoch": 0.5008924893587807, + "grad_norm": 0.5679266452789307, + "learning_rate": 0.00027765567765567764, + "loss": 0.651, + "step": 456 + }, + { + "epoch": 0.501990937800357, + "grad_norm": 0.4854479134082794, + "learning_rate": 0.0002775335775335775, + "loss": 0.9908, + "step": 457 + }, + { + "epoch": 0.5030893862419332, + "grad_norm": 0.4964112341403961, + "learning_rate": 0.0002774114774114774, + "loss": 0.8084, + "step": 458 + }, + { + "epoch": 0.5041878346835096, + "grad_norm": 0.5130513906478882, + "learning_rate": 0.00027728937728937727, + "loss": 0.8389, + "step": 459 + }, + { + "epoch": 0.5052862831250858, + "grad_norm": 0.4784137010574341, + "learning_rate": 0.0002771672771672771, + "loss": 0.5497, + "step": 460 + }, + { + "epoch": 0.5063847315666621, + "grad_norm": 0.28685998916625977, + "learning_rate": 0.00027704517704517704, + "loss": 0.491, + "step": 461 + }, + { + "epoch": 0.5074831800082383, + "grad_norm": 0.5337100625038147, + "learning_rate": 0.0002769230769230769, + "loss": 0.8315, + "step": 462 + }, + { + "epoch": 0.5085816284498146, + "grad_norm": 0.5431344509124756, + "learning_rate": 0.00027680097680097675, + "loss": 0.5996, + "step": 463 + }, + { + "epoch": 0.5096800768913909, + "grad_norm": 0.4546130299568176, + "learning_rate": 0.00027667887667887667, + "loss": 0.5647, + "step": 464 + }, + { + "epoch": 0.5107785253329672, + "grad_norm": 0.6298655271530151, + "learning_rate": 0.0002765567765567765, + "loss": 0.7684, + "step": 465 + }, + { + "epoch": 0.5118769737745434, + "grad_norm": 0.44330841302871704, + "learning_rate": 0.0002764346764346764, + "loss": 0.4906, + "step": 466 + }, + { + "epoch": 0.5129754222161197, + "grad_norm": 0.3824306130409241, + "learning_rate": 0.0002763125763125763, + "loss": 0.6123, + "step": 467 + }, + { + "epoch": 0.514073870657696, + "grad_norm": 0.3225514590740204, + "learning_rate": 0.00027619047619047615, + "loss": 0.7535, + "step": 468 + }, + { + "epoch": 0.5151723190992723, + "grad_norm": 0.701239824295044, + "learning_rate": 0.00027606837606837607, + "loss": 0.9643, + "step": 469 + }, + { + "epoch": 0.5162707675408486, + "grad_norm": 0.37800920009613037, + "learning_rate": 0.0002759462759462759, + "loss": 0.543, + "step": 470 + }, + { + "epoch": 0.5173692159824248, + "grad_norm": 0.3521328568458557, + "learning_rate": 0.0002758241758241758, + "loss": 0.7157, + "step": 471 + }, + { + "epoch": 0.518467664424001, + "grad_norm": 0.2659924626350403, + "learning_rate": 0.0002757020757020757, + "loss": 0.7334, + "step": 472 + }, + { + "epoch": 0.5195661128655774, + "grad_norm": 0.42815065383911133, + "learning_rate": 0.00027557997557997555, + "loss": 1.2015, + "step": 473 + }, + { + "epoch": 0.5206645613071537, + "grad_norm": 0.7758998870849609, + "learning_rate": 0.0002754578754578754, + "loss": 0.9493, + "step": 474 + }, + { + "epoch": 0.5217630097487299, + "grad_norm": 0.46281251311302185, + "learning_rate": 0.0002753357753357753, + "loss": 0.9159, + "step": 475 + }, + { + "epoch": 0.5228614581903062, + "grad_norm": 0.3668971061706543, + "learning_rate": 0.0002752136752136752, + "loss": 0.4869, + "step": 476 + }, + { + "epoch": 0.5239599066318824, + "grad_norm": 0.462534099817276, + "learning_rate": 0.00027509157509157504, + "loss": 0.6439, + "step": 477 + }, + { + "epoch": 0.5250583550734588, + "grad_norm": 0.6341688632965088, + "learning_rate": 0.00027496947496947495, + "loss": 0.6948, + "step": 478 + }, + { + "epoch": 0.526156803515035, + "grad_norm": 0.5469139814376831, + "learning_rate": 0.0002748473748473748, + "loss": 1.016, + "step": 479 + }, + { + "epoch": 0.5272552519566113, + "grad_norm": 0.438204288482666, + "learning_rate": 0.00027472527472527467, + "loss": 0.6941, + "step": 480 + }, + { + "epoch": 0.5283537003981875, + "grad_norm": 0.586700975894928, + "learning_rate": 0.0002746031746031746, + "loss": 0.6649, + "step": 481 + }, + { + "epoch": 0.5294521488397639, + "grad_norm": 0.4077949523925781, + "learning_rate": 0.0002744810744810745, + "loss": 0.5948, + "step": 482 + }, + { + "epoch": 0.5305505972813401, + "grad_norm": 0.3756411373615265, + "learning_rate": 0.0002743589743589743, + "loss": 0.4915, + "step": 483 + }, + { + "epoch": 0.5316490457229164, + "grad_norm": 1.2067008018493652, + "learning_rate": 0.0002742368742368742, + "loss": 0.8795, + "step": 484 + }, + { + "epoch": 0.5327474941644926, + "grad_norm": 0.3097778260707855, + "learning_rate": 0.0002741147741147741, + "loss": 0.5478, + "step": 485 + }, + { + "epoch": 0.5338459426060689, + "grad_norm": 0.5536866188049316, + "learning_rate": 0.000273992673992674, + "loss": 0.7042, + "step": 486 + }, + { + "epoch": 0.5349443910476452, + "grad_norm": 0.5930231809616089, + "learning_rate": 0.00027387057387057384, + "loss": 0.7108, + "step": 487 + }, + { + "epoch": 0.5360428394892215, + "grad_norm": 0.39304253458976746, + "learning_rate": 0.00027374847374847375, + "loss": 0.788, + "step": 488 + }, + { + "epoch": 0.5371412879307977, + "grad_norm": 0.5238274335861206, + "learning_rate": 0.0002736263736263736, + "loss": 0.9887, + "step": 489 + }, + { + "epoch": 0.538239736372374, + "grad_norm": 0.5993770956993103, + "learning_rate": 0.00027350427350427347, + "loss": 0.7819, + "step": 490 + }, + { + "epoch": 0.5393381848139503, + "grad_norm": 0.4601563811302185, + "learning_rate": 0.00027338217338217333, + "loss": 0.4347, + "step": 491 + }, + { + "epoch": 0.5404366332555266, + "grad_norm": 0.5292415022850037, + "learning_rate": 0.00027326007326007324, + "loss": 0.5248, + "step": 492 + }, + { + "epoch": 0.5415350816971028, + "grad_norm": 0.37247565388679504, + "learning_rate": 0.0002731379731379731, + "loss": 0.5412, + "step": 493 + }, + { + "epoch": 0.5426335301386791, + "grad_norm": 0.6865994930267334, + "learning_rate": 0.00027301587301587296, + "loss": 0.8263, + "step": 494 + }, + { + "epoch": 0.5437319785802553, + "grad_norm": 0.5019715428352356, + "learning_rate": 0.00027289377289377287, + "loss": 0.7084, + "step": 495 + }, + { + "epoch": 0.5448304270218317, + "grad_norm": 0.8432828783988953, + "learning_rate": 0.00027277167277167273, + "loss": 0.6188, + "step": 496 + }, + { + "epoch": 0.545928875463408, + "grad_norm": 0.594881534576416, + "learning_rate": 0.0002726495726495726, + "loss": 0.8923, + "step": 497 + }, + { + "epoch": 0.5470273239049842, + "grad_norm": 0.5573694705963135, + "learning_rate": 0.0002725274725274725, + "loss": 0.6351, + "step": 498 + }, + { + "epoch": 0.5481257723465605, + "grad_norm": 0.30426710844039917, + "learning_rate": 0.0002724053724053724, + "loss": 0.6359, + "step": 499 + }, + { + "epoch": 0.5492242207881367, + "grad_norm": 0.759385883808136, + "learning_rate": 0.00027228327228327227, + "loss": 0.6131, + "step": 500 + }, + { + "epoch": 0.5503226692297131, + "grad_norm": 0.5436901450157166, + "learning_rate": 0.00027216117216117213, + "loss": 0.5232, + "step": 501 + }, + { + "epoch": 0.5514211176712893, + "grad_norm": 0.5924163460731506, + "learning_rate": 0.00027203907203907204, + "loss": 0.9594, + "step": 502 + }, + { + "epoch": 0.5525195661128656, + "grad_norm": 0.49177658557891846, + "learning_rate": 0.0002719169719169719, + "loss": 0.842, + "step": 503 + }, + { + "epoch": 0.5536180145544418, + "grad_norm": 0.4437295198440552, + "learning_rate": 0.00027179487179487176, + "loss": 1.0338, + "step": 504 + }, + { + "epoch": 0.5547164629960182, + "grad_norm": 0.426213800907135, + "learning_rate": 0.00027167277167277167, + "loss": 0.6375, + "step": 505 + }, + { + "epoch": 0.5558149114375944, + "grad_norm": 0.4599516689777374, + "learning_rate": 0.00027155067155067153, + "loss": 0.5005, + "step": 506 + }, + { + "epoch": 0.5569133598791707, + "grad_norm": 0.647957980632782, + "learning_rate": 0.0002714285714285714, + "loss": 0.6292, + "step": 507 + }, + { + "epoch": 0.5580118083207469, + "grad_norm": 0.7891755104064941, + "learning_rate": 0.0002713064713064713, + "loss": 0.697, + "step": 508 + }, + { + "epoch": 0.5591102567623232, + "grad_norm": 0.5290817618370056, + "learning_rate": 0.00027118437118437116, + "loss": 0.4547, + "step": 509 + }, + { + "epoch": 0.5602087052038995, + "grad_norm": 0.4025941789150238, + "learning_rate": 0.000271062271062271, + "loss": 0.6299, + "step": 510 + }, + { + "epoch": 0.5613071536454758, + "grad_norm": 0.7768287658691406, + "learning_rate": 0.00027094017094017093, + "loss": 0.6813, + "step": 511 + }, + { + "epoch": 0.562405602087052, + "grad_norm": 0.6977662444114685, + "learning_rate": 0.0002708180708180708, + "loss": 0.8217, + "step": 512 + }, + { + "epoch": 0.5635040505286283, + "grad_norm": 0.5238949060440063, + "learning_rate": 0.0002706959706959707, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.5646024989702045, + "grad_norm": 0.5099830627441406, + "learning_rate": 0.00027057387057387056, + "loss": 0.9894, + "step": 514 + }, + { + "epoch": 0.5657009474117809, + "grad_norm": 0.6254756450653076, + "learning_rate": 0.0002704517704517704, + "loss": 0.9258, + "step": 515 + }, + { + "epoch": 0.5667993958533571, + "grad_norm": 0.40313196182250977, + "learning_rate": 0.00027032967032967033, + "loss": 0.8115, + "step": 516 + }, + { + "epoch": 0.5678978442949334, + "grad_norm": 0.9706575274467468, + "learning_rate": 0.0002702075702075702, + "loss": 0.5204, + "step": 517 + }, + { + "epoch": 0.5689962927365096, + "grad_norm": 0.36777085065841675, + "learning_rate": 0.00027008547008547005, + "loss": 0.7716, + "step": 518 + }, + { + "epoch": 0.570094741178086, + "grad_norm": 0.48726886510849, + "learning_rate": 0.00026996336996336996, + "loss": 0.7745, + "step": 519 + }, + { + "epoch": 0.5711931896196623, + "grad_norm": 0.3590470850467682, + "learning_rate": 0.0002698412698412698, + "loss": 0.7038, + "step": 520 + }, + { + "epoch": 0.5722916380612385, + "grad_norm": 0.7103118896484375, + "learning_rate": 0.0002697191697191697, + "loss": 0.8368, + "step": 521 + }, + { + "epoch": 0.5733900865028148, + "grad_norm": 0.5503933429718018, + "learning_rate": 0.0002695970695970696, + "loss": 0.6164, + "step": 522 + }, + { + "epoch": 0.574488534944391, + "grad_norm": 0.5255150198936462, + "learning_rate": 0.00026947496947496945, + "loss": 0.8886, + "step": 523 + }, + { + "epoch": 0.5755869833859674, + "grad_norm": 0.4872569739818573, + "learning_rate": 0.0002693528693528693, + "loss": 0.6277, + "step": 524 + }, + { + "epoch": 0.5766854318275436, + "grad_norm": 0.3748464584350586, + "learning_rate": 0.0002692307692307692, + "loss": 0.6471, + "step": 525 + }, + { + "epoch": 0.5777838802691199, + "grad_norm": 0.4401276111602783, + "learning_rate": 0.0002691086691086691, + "loss": 0.9846, + "step": 526 + }, + { + "epoch": 0.5788823287106961, + "grad_norm": 0.9565305709838867, + "learning_rate": 0.00026898656898656894, + "loss": 0.9471, + "step": 527 + }, + { + "epoch": 0.5799807771522724, + "grad_norm": 0.6307245492935181, + "learning_rate": 0.00026886446886446885, + "loss": 0.9168, + "step": 528 + }, + { + "epoch": 0.5810792255938487, + "grad_norm": 0.49177634716033936, + "learning_rate": 0.0002687423687423687, + "loss": 0.5464, + "step": 529 + }, + { + "epoch": 0.582177674035425, + "grad_norm": 0.68553626537323, + "learning_rate": 0.0002686202686202686, + "loss": 0.5874, + "step": 530 + }, + { + "epoch": 0.5832761224770012, + "grad_norm": 0.3811597228050232, + "learning_rate": 0.0002684981684981685, + "loss": 0.766, + "step": 531 + }, + { + "epoch": 0.5843745709185775, + "grad_norm": 0.6634503602981567, + "learning_rate": 0.00026837606837606834, + "loss": 0.6438, + "step": 532 + }, + { + "epoch": 0.5854730193601538, + "grad_norm": 0.6115571856498718, + "learning_rate": 0.00026825396825396825, + "loss": 0.8757, + "step": 533 + }, + { + "epoch": 0.5865714678017301, + "grad_norm": 0.3011985719203949, + "learning_rate": 0.0002681318681318681, + "loss": 0.6188, + "step": 534 + }, + { + "epoch": 0.5876699162433063, + "grad_norm": 0.7029386162757874, + "learning_rate": 0.00026800976800976797, + "loss": 0.8681, + "step": 535 + }, + { + "epoch": 0.5887683646848826, + "grad_norm": 0.4796508550643921, + "learning_rate": 0.0002678876678876679, + "loss": 0.7207, + "step": 536 + }, + { + "epoch": 0.5898668131264588, + "grad_norm": 0.542948842048645, + "learning_rate": 0.00026776556776556774, + "loss": 0.5587, + "step": 537 + }, + { + "epoch": 0.5909652615680352, + "grad_norm": 0.7566731572151184, + "learning_rate": 0.0002676434676434676, + "loss": 0.8562, + "step": 538 + }, + { + "epoch": 0.5920637100096114, + "grad_norm": 0.6411837339401245, + "learning_rate": 0.0002675213675213675, + "loss": 0.4516, + "step": 539 + }, + { + "epoch": 0.5931621584511877, + "grad_norm": 0.41434159874916077, + "learning_rate": 0.00026739926739926737, + "loss": 0.7069, + "step": 540 + }, + { + "epoch": 0.5942606068927639, + "grad_norm": 0.29941752552986145, + "learning_rate": 0.0002672771672771672, + "loss": 0.7444, + "step": 541 + }, + { + "epoch": 0.5953590553343402, + "grad_norm": 1.8168927431106567, + "learning_rate": 0.00026715506715506714, + "loss": 0.4947, + "step": 542 + }, + { + "epoch": 0.5964575037759166, + "grad_norm": 0.5639868974685669, + "learning_rate": 0.000267032967032967, + "loss": 0.6749, + "step": 543 + }, + { + "epoch": 0.5975559522174928, + "grad_norm": 0.5054119229316711, + "learning_rate": 0.0002669108669108669, + "loss": 0.8075, + "step": 544 + }, + { + "epoch": 0.598654400659069, + "grad_norm": 0.3531246483325958, + "learning_rate": 0.00026678876678876677, + "loss": 0.6986, + "step": 545 + }, + { + "epoch": 0.5997528491006453, + "grad_norm": 0.36428287625312805, + "learning_rate": 0.0002666666666666666, + "loss": 0.6496, + "step": 546 + }, + { + "epoch": 0.6008512975422217, + "grad_norm": 0.45706960558891296, + "learning_rate": 0.00026654456654456654, + "loss": 0.5646, + "step": 547 + }, + { + "epoch": 0.6019497459837979, + "grad_norm": 0.39326363801956177, + "learning_rate": 0.0002664224664224664, + "loss": 0.5037, + "step": 548 + }, + { + "epoch": 0.6030481944253742, + "grad_norm": 0.7158151268959045, + "learning_rate": 0.00026630036630036625, + "loss": 0.5643, + "step": 549 + }, + { + "epoch": 0.6041466428669504, + "grad_norm": 0.398335337638855, + "learning_rate": 0.00026617826617826617, + "loss": 0.5462, + "step": 550 + }, + { + "epoch": 0.6052450913085267, + "grad_norm": 0.8625812530517578, + "learning_rate": 0.000266056166056166, + "loss": 0.7898, + "step": 551 + }, + { + "epoch": 0.606343539750103, + "grad_norm": 0.5558099150657654, + "learning_rate": 0.0002659340659340659, + "loss": 0.7968, + "step": 552 + }, + { + "epoch": 0.6074419881916793, + "grad_norm": 0.6244741678237915, + "learning_rate": 0.0002658119658119658, + "loss": 0.9085, + "step": 553 + }, + { + "epoch": 0.6085404366332555, + "grad_norm": 0.4907127916812897, + "learning_rate": 0.00026568986568986565, + "loss": 0.5683, + "step": 554 + }, + { + "epoch": 0.6096388850748318, + "grad_norm": 0.6140159964561462, + "learning_rate": 0.0002655677655677655, + "loss": 0.5693, + "step": 555 + }, + { + "epoch": 0.610737333516408, + "grad_norm": 0.41251274943351746, + "learning_rate": 0.0002654456654456654, + "loss": 0.728, + "step": 556 + }, + { + "epoch": 0.6118357819579844, + "grad_norm": 0.43427684903144836, + "learning_rate": 0.00026532356532356534, + "loss": 0.5692, + "step": 557 + }, + { + "epoch": 0.6129342303995606, + "grad_norm": 0.41471078991889954, + "learning_rate": 0.00026520146520146514, + "loss": 0.6616, + "step": 558 + }, + { + "epoch": 0.6140326788411369, + "grad_norm": 0.4406953752040863, + "learning_rate": 0.00026507936507936506, + "loss": 0.4764, + "step": 559 + }, + { + "epoch": 0.6151311272827131, + "grad_norm": 7.233060359954834, + "learning_rate": 0.00026495726495726497, + "loss": 0.6111, + "step": 560 + }, + { + "epoch": 0.6162295757242895, + "grad_norm": 0.47008857131004333, + "learning_rate": 0.0002648351648351648, + "loss": 0.8145, + "step": 561 + }, + { + "epoch": 0.6173280241658657, + "grad_norm": 0.47636717557907104, + "learning_rate": 0.0002647130647130647, + "loss": 0.8036, + "step": 562 + }, + { + "epoch": 0.618426472607442, + "grad_norm": 0.526971161365509, + "learning_rate": 0.0002645909645909646, + "loss": 0.7559, + "step": 563 + }, + { + "epoch": 0.6195249210490182, + "grad_norm": 0.5027382373809814, + "learning_rate": 0.00026446886446886446, + "loss": 0.7765, + "step": 564 + }, + { + "epoch": 0.6206233694905945, + "grad_norm": 0.4222506284713745, + "learning_rate": 0.0002643467643467643, + "loss": 0.6376, + "step": 565 + }, + { + "epoch": 0.6217218179321709, + "grad_norm": 0.6390372514724731, + "learning_rate": 0.0002642246642246642, + "loss": 0.8224, + "step": 566 + }, + { + "epoch": 0.6228202663737471, + "grad_norm": 0.44495514035224915, + "learning_rate": 0.0002641025641025641, + "loss": 0.5995, + "step": 567 + }, + { + "epoch": 0.6239187148153233, + "grad_norm": 0.7005137205123901, + "learning_rate": 0.00026398046398046394, + "loss": 0.4986, + "step": 568 + }, + { + "epoch": 0.6250171632568996, + "grad_norm": 0.40745365619659424, + "learning_rate": 0.0002638583638583638, + "loss": 0.608, + "step": 569 + }, + { + "epoch": 0.6261156116984758, + "grad_norm": 0.3449142277240753, + "learning_rate": 0.0002637362637362637, + "loss": 0.6253, + "step": 570 + }, + { + "epoch": 0.6272140601400522, + "grad_norm": 0.4318457841873169, + "learning_rate": 0.00026361416361416357, + "loss": 0.6376, + "step": 571 + }, + { + "epoch": 0.6283125085816285, + "grad_norm": 2.2202258110046387, + "learning_rate": 0.00026349206349206343, + "loss": 0.5477, + "step": 572 + }, + { + "epoch": 0.6294109570232047, + "grad_norm": 0.6759721040725708, + "learning_rate": 0.00026336996336996334, + "loss": 1.1176, + "step": 573 + }, + { + "epoch": 0.630509405464781, + "grad_norm": 1.7796927690505981, + "learning_rate": 0.00026324786324786326, + "loss": 0.8713, + "step": 574 + }, + { + "epoch": 0.6316078539063573, + "grad_norm": 0.32952558994293213, + "learning_rate": 0.0002631257631257631, + "loss": 0.4711, + "step": 575 + }, + { + "epoch": 0.6327063023479336, + "grad_norm": 0.40390628576278687, + "learning_rate": 0.000263003663003663, + "loss": 0.5412, + "step": 576 + }, + { + "epoch": 0.6338047507895098, + "grad_norm": 0.7439208030700684, + "learning_rate": 0.0002628815628815629, + "loss": 0.7094, + "step": 577 + }, + { + "epoch": 0.6349031992310861, + "grad_norm": 0.34505775570869446, + "learning_rate": 0.00026275946275946274, + "loss": 0.5939, + "step": 578 + }, + { + "epoch": 0.6360016476726623, + "grad_norm": 0.9452011585235596, + "learning_rate": 0.0002626373626373626, + "loss": 0.5108, + "step": 579 + }, + { + "epoch": 0.6371000961142387, + "grad_norm": 0.42789551615715027, + "learning_rate": 0.0002625152625152625, + "loss": 0.5661, + "step": 580 + }, + { + "epoch": 0.6381985445558149, + "grad_norm": 0.3460575044155121, + "learning_rate": 0.0002623931623931624, + "loss": 0.8333, + "step": 581 + }, + { + "epoch": 0.6392969929973912, + "grad_norm": 0.8932168483734131, + "learning_rate": 0.00026227106227106223, + "loss": 0.7058, + "step": 582 + }, + { + "epoch": 0.6403954414389674, + "grad_norm": 0.8588842749595642, + "learning_rate": 0.00026214896214896214, + "loss": 0.6905, + "step": 583 + }, + { + "epoch": 0.6414938898805437, + "grad_norm": 0.5097251534461975, + "learning_rate": 0.000262026862026862, + "loss": 0.8189, + "step": 584 + }, + { + "epoch": 0.64259233832212, + "grad_norm": 0.45746755599975586, + "learning_rate": 0.00026190476190476186, + "loss": 0.7212, + "step": 585 + }, + { + "epoch": 0.6436907867636963, + "grad_norm": 0.9576689600944519, + "learning_rate": 0.0002617826617826618, + "loss": 0.6159, + "step": 586 + }, + { + "epoch": 0.6447892352052725, + "grad_norm": 0.5721899271011353, + "learning_rate": 0.00026166056166056163, + "loss": 0.6083, + "step": 587 + }, + { + "epoch": 0.6458876836468488, + "grad_norm": 0.4851115942001343, + "learning_rate": 0.00026153846153846154, + "loss": 0.7678, + "step": 588 + }, + { + "epoch": 0.6469861320884251, + "grad_norm": 0.6631761193275452, + "learning_rate": 0.0002614163614163614, + "loss": 0.7068, + "step": 589 + }, + { + "epoch": 0.6480845805300014, + "grad_norm": 0.6862382292747498, + "learning_rate": 0.00026129426129426126, + "loss": 0.5766, + "step": 590 + }, + { + "epoch": 0.6491830289715776, + "grad_norm": 0.3754968047142029, + "learning_rate": 0.0002611721611721612, + "loss": 0.7254, + "step": 591 + }, + { + "epoch": 0.6502814774131539, + "grad_norm": 0.5239700078964233, + "learning_rate": 0.00026105006105006103, + "loss": 0.5777, + "step": 592 + }, + { + "epoch": 0.6513799258547301, + "grad_norm": 0.5103443264961243, + "learning_rate": 0.0002609279609279609, + "loss": 1.0006, + "step": 593 + }, + { + "epoch": 0.6524783742963065, + "grad_norm": 0.4733884632587433, + "learning_rate": 0.0002608058608058608, + "loss": 0.6851, + "step": 594 + }, + { + "epoch": 0.6535768227378828, + "grad_norm": 0.5982065796852112, + "learning_rate": 0.00026068376068376066, + "loss": 0.6295, + "step": 595 + }, + { + "epoch": 0.654675271179459, + "grad_norm": 1.2408190965652466, + "learning_rate": 0.0002605616605616605, + "loss": 0.8806, + "step": 596 + }, + { + "epoch": 0.6557737196210353, + "grad_norm": 0.6005455851554871, + "learning_rate": 0.00026043956043956043, + "loss": 0.7186, + "step": 597 + }, + { + "epoch": 0.6568721680626116, + "grad_norm": 0.33777105808258057, + "learning_rate": 0.0002603174603174603, + "loss": 0.4599, + "step": 598 + }, + { + "epoch": 0.6579706165041879, + "grad_norm": 0.5336529612541199, + "learning_rate": 0.00026019536019536015, + "loss": 0.553, + "step": 599 + }, + { + "epoch": 0.6590690649457641, + "grad_norm": 0.6930931806564331, + "learning_rate": 0.00026007326007326006, + "loss": 0.5686, + "step": 600 + }, + { + "epoch": 0.6601675133873404, + "grad_norm": 1.1340439319610596, + "learning_rate": 0.0002599511599511599, + "loss": 0.5886, + "step": 601 + }, + { + "epoch": 0.6612659618289166, + "grad_norm": 0.9833797812461853, + "learning_rate": 0.0002598290598290598, + "loss": 0.7109, + "step": 602 + }, + { + "epoch": 0.662364410270493, + "grad_norm": 0.9305315017700195, + "learning_rate": 0.0002597069597069597, + "loss": 0.8341, + "step": 603 + }, + { + "epoch": 0.6634628587120692, + "grad_norm": 0.9753265380859375, + "learning_rate": 0.00025958485958485955, + "loss": 0.7102, + "step": 604 + }, + { + "epoch": 0.6645613071536455, + "grad_norm": 2.2342822551727295, + "learning_rate": 0.00025946275946275946, + "loss": 0.6784, + "step": 605 + }, + { + "epoch": 0.6656597555952217, + "grad_norm": 0.6815157532691956, + "learning_rate": 0.0002593406593406593, + "loss": 0.7689, + "step": 606 + }, + { + "epoch": 0.666758204036798, + "grad_norm": 0.7792591452598572, + "learning_rate": 0.0002592185592185592, + "loss": 0.9444, + "step": 607 + }, + { + "epoch": 0.6678566524783743, + "grad_norm": 0.668251097202301, + "learning_rate": 0.0002590964590964591, + "loss": 0.6899, + "step": 608 + }, + { + "epoch": 0.6689551009199506, + "grad_norm": 0.5041349530220032, + "learning_rate": 0.00025897435897435895, + "loss": 0.652, + "step": 609 + }, + { + "epoch": 0.6700535493615268, + "grad_norm": 0.35069939494132996, + "learning_rate": 0.0002588522588522588, + "loss": 0.8102, + "step": 610 + }, + { + "epoch": 0.6711519978031031, + "grad_norm": 3.324793577194214, + "learning_rate": 0.0002587301587301587, + "loss": 0.7936, + "step": 611 + }, + { + "epoch": 0.6722504462446794, + "grad_norm": 0.6778903007507324, + "learning_rate": 0.0002586080586080586, + "loss": 0.6258, + "step": 612 + }, + { + "epoch": 0.6733488946862557, + "grad_norm": 3.034745454788208, + "learning_rate": 0.00025848595848595844, + "loss": 0.697, + "step": 613 + }, + { + "epoch": 0.6744473431278319, + "grad_norm": 2.563870429992676, + "learning_rate": 0.00025836385836385835, + "loss": 0.7596, + "step": 614 + }, + { + "epoch": 0.6755457915694082, + "grad_norm": 0.45592913031578064, + "learning_rate": 0.0002582417582417582, + "loss": 0.7753, + "step": 615 + }, + { + "epoch": 0.6766442400109844, + "grad_norm": 0.7209720015525818, + "learning_rate": 0.00025811965811965807, + "loss": 0.6907, + "step": 616 + }, + { + "epoch": 0.6777426884525608, + "grad_norm": 0.4611949026584625, + "learning_rate": 0.000257997557997558, + "loss": 0.5896, + "step": 617 + }, + { + "epoch": 0.678841136894137, + "grad_norm": 1.3885395526885986, + "learning_rate": 0.0002578754578754579, + "loss": 0.6344, + "step": 618 + }, + { + "epoch": 0.6799395853357133, + "grad_norm": 0.544572651386261, + "learning_rate": 0.00025775335775335775, + "loss": 0.586, + "step": 619 + }, + { + "epoch": 0.6810380337772896, + "grad_norm": 0.5637034177780151, + "learning_rate": 0.0002576312576312576, + "loss": 0.8284, + "step": 620 + }, + { + "epoch": 0.6821364822188658, + "grad_norm": 1.170779824256897, + "learning_rate": 0.00025750915750915747, + "loss": 0.8818, + "step": 621 + }, + { + "epoch": 0.6832349306604422, + "grad_norm": 0.4877263605594635, + "learning_rate": 0.0002573870573870574, + "loss": 0.9179, + "step": 622 + }, + { + "epoch": 0.6843333791020184, + "grad_norm": 0.6684415340423584, + "learning_rate": 0.00025726495726495724, + "loss": 0.7358, + "step": 623 + }, + { + "epoch": 0.6854318275435947, + "grad_norm": 0.6679075956344604, + "learning_rate": 0.0002571428571428571, + "loss": 0.6342, + "step": 624 + }, + { + "epoch": 0.6865302759851709, + "grad_norm": 0.65242600440979, + "learning_rate": 0.000257020757020757, + "loss": 0.4762, + "step": 625 + }, + { + "epoch": 0.6876287244267473, + "grad_norm": 0.806523859500885, + "learning_rate": 0.00025689865689865687, + "loss": 0.7621, + "step": 626 + }, + { + "epoch": 0.6887271728683235, + "grad_norm": 1.09652578830719, + "learning_rate": 0.0002567765567765567, + "loss": 0.6594, + "step": 627 + }, + { + "epoch": 0.6898256213098998, + "grad_norm": 0.412505179643631, + "learning_rate": 0.00025665445665445664, + "loss": 0.8026, + "step": 628 + }, + { + "epoch": 0.690924069751476, + "grad_norm": 0.5801676511764526, + "learning_rate": 0.0002565323565323565, + "loss": 0.7026, + "step": 629 + }, + { + "epoch": 0.6920225181930523, + "grad_norm": 0.6822883486747742, + "learning_rate": 0.00025641025641025636, + "loss": 0.4372, + "step": 630 + }, + { + "epoch": 0.6931209666346286, + "grad_norm": 0.3455508351325989, + "learning_rate": 0.00025628815628815627, + "loss": 0.5624, + "step": 631 + }, + { + "epoch": 0.6942194150762049, + "grad_norm": 0.3533216714859009, + "learning_rate": 0.0002561660561660562, + "loss": 0.7493, + "step": 632 + }, + { + "epoch": 0.6953178635177811, + "grad_norm": 1.4306656122207642, + "learning_rate": 0.000256043956043956, + "loss": 0.7537, + "step": 633 + }, + { + "epoch": 0.6964163119593574, + "grad_norm": 0.336393266916275, + "learning_rate": 0.0002559218559218559, + "loss": 0.787, + "step": 634 + }, + { + "epoch": 0.6975147604009336, + "grad_norm": 0.5303547382354736, + "learning_rate": 0.0002557997557997558, + "loss": 0.5604, + "step": 635 + }, + { + "epoch": 0.69861320884251, + "grad_norm": 0.5421821475028992, + "learning_rate": 0.00025567765567765567, + "loss": 0.6905, + "step": 636 + }, + { + "epoch": 0.6997116572840862, + "grad_norm": 0.5445061922073364, + "learning_rate": 0.00025555555555555553, + "loss": 0.6389, + "step": 637 + }, + { + "epoch": 0.7008101057256625, + "grad_norm": 0.42832881212234497, + "learning_rate": 0.00025543345543345544, + "loss": 0.7825, + "step": 638 + }, + { + "epoch": 0.7019085541672387, + "grad_norm": 1.4624862670898438, + "learning_rate": 0.0002553113553113553, + "loss": 0.4964, + "step": 639 + }, + { + "epoch": 0.7030070026088151, + "grad_norm": 0.38657426834106445, + "learning_rate": 0.00025518925518925516, + "loss": 0.5299, + "step": 640 + }, + { + "epoch": 0.7041054510503914, + "grad_norm": 14.422834396362305, + "learning_rate": 0.00025506715506715507, + "loss": 0.5008, + "step": 641 + }, + { + "epoch": 0.7052038994919676, + "grad_norm": 0.591106653213501, + "learning_rate": 0.00025494505494505493, + "loss": 0.6732, + "step": 642 + }, + { + "epoch": 0.7063023479335439, + "grad_norm": 1.6697375774383545, + "learning_rate": 0.0002548229548229548, + "loss": 0.6782, + "step": 643 + }, + { + "epoch": 0.7074007963751201, + "grad_norm": 1.670777678489685, + "learning_rate": 0.0002547008547008547, + "loss": 0.5275, + "step": 644 + }, + { + "epoch": 0.7084992448166965, + "grad_norm": 2.3361563682556152, + "learning_rate": 0.00025457875457875456, + "loss": 0.4177, + "step": 645 + }, + { + "epoch": 0.7095976932582727, + "grad_norm": 1.823844313621521, + "learning_rate": 0.0002544566544566544, + "loss": 0.5438, + "step": 646 + }, + { + "epoch": 0.710696141699849, + "grad_norm": 0.5374146699905396, + "learning_rate": 0.0002543345543345543, + "loss": 0.6704, + "step": 647 + }, + { + "epoch": 0.7117945901414252, + "grad_norm": 0.9709361791610718, + "learning_rate": 0.0002542124542124542, + "loss": 0.8896, + "step": 648 + }, + { + "epoch": 0.7128930385830015, + "grad_norm": 0.7118197083473206, + "learning_rate": 0.0002540903540903541, + "loss": 0.766, + "step": 649 + }, + { + "epoch": 0.7139914870245778, + "grad_norm": 0.4597225487232208, + "learning_rate": 0.00025396825396825396, + "loss": 0.7498, + "step": 650 + }, + { + "epoch": 0.7150899354661541, + "grad_norm": 0.9708977937698364, + "learning_rate": 0.0002538461538461538, + "loss": 0.7602, + "step": 651 + }, + { + "epoch": 0.7161883839077303, + "grad_norm": 0.8156960606575012, + "learning_rate": 0.00025372405372405373, + "loss": 1.1105, + "step": 652 + }, + { + "epoch": 0.7172868323493066, + "grad_norm": 1.4135644435882568, + "learning_rate": 0.0002536019536019536, + "loss": 0.9203, + "step": 653 + }, + { + "epoch": 0.7183852807908829, + "grad_norm": 0.5754226446151733, + "learning_rate": 0.00025347985347985344, + "loss": 0.5368, + "step": 654 + }, + { + "epoch": 0.7194837292324592, + "grad_norm": 1.7644588947296143, + "learning_rate": 0.00025335775335775336, + "loss": 0.6451, + "step": 655 + }, + { + "epoch": 0.7205821776740354, + "grad_norm": 4.35576868057251, + "learning_rate": 0.0002532356532356532, + "loss": 0.6732, + "step": 656 + }, + { + "epoch": 0.7216806261156117, + "grad_norm": 1.1072558164596558, + "learning_rate": 0.0002531135531135531, + "loss": 0.7901, + "step": 657 + }, + { + "epoch": 0.7227790745571879, + "grad_norm": 0.3916113078594208, + "learning_rate": 0.000252991452991453, + "loss": 0.7153, + "step": 658 + }, + { + "epoch": 0.7238775229987643, + "grad_norm": 1.055137276649475, + "learning_rate": 0.00025286935286935285, + "loss": 0.8664, + "step": 659 + }, + { + "epoch": 0.7249759714403405, + "grad_norm": 0.5966087579727173, + "learning_rate": 0.0002527472527472527, + "loss": 0.933, + "step": 660 + }, + { + "epoch": 0.7260744198819168, + "grad_norm": 0.40958529710769653, + "learning_rate": 0.0002526251526251526, + "loss": 0.7196, + "step": 661 + }, + { + "epoch": 0.727172868323493, + "grad_norm": 0.4636710584163666, + "learning_rate": 0.0002525030525030525, + "loss": 0.7039, + "step": 662 + }, + { + "epoch": 0.7282713167650693, + "grad_norm": 0.6967337131500244, + "learning_rate": 0.0002523809523809524, + "loss": 0.8981, + "step": 663 + }, + { + "epoch": 0.7293697652066456, + "grad_norm": 0.49781784415245056, + "learning_rate": 0.00025225885225885225, + "loss": 0.7239, + "step": 664 + }, + { + "epoch": 0.7304682136482219, + "grad_norm": 0.940851628780365, + "learning_rate": 0.0002521367521367521, + "loss": 0.8199, + "step": 665 + }, + { + "epoch": 0.7315666620897981, + "grad_norm": 1.0271226167678833, + "learning_rate": 0.000252014652014652, + "loss": 0.6757, + "step": 666 + }, + { + "epoch": 0.7326651105313744, + "grad_norm": 0.5299912095069885, + "learning_rate": 0.0002518925518925519, + "loss": 0.8464, + "step": 667 + }, + { + "epoch": 0.7337635589729508, + "grad_norm": 0.7060052156448364, + "learning_rate": 0.00025177045177045173, + "loss": 0.6541, + "step": 668 + }, + { + "epoch": 0.734862007414527, + "grad_norm": 0.5419691205024719, + "learning_rate": 0.00025164835164835165, + "loss": 0.8741, + "step": 669 + }, + { + "epoch": 0.7359604558561033, + "grad_norm": 0.6363463401794434, + "learning_rate": 0.0002515262515262515, + "loss": 0.7224, + "step": 670 + }, + { + "epoch": 0.7370589042976795, + "grad_norm": 0.7622922658920288, + "learning_rate": 0.00025140415140415136, + "loss": 0.9402, + "step": 671 + }, + { + "epoch": 0.7381573527392558, + "grad_norm": 0.7477490305900574, + "learning_rate": 0.0002512820512820513, + "loss": 0.6036, + "step": 672 + }, + { + "epoch": 0.7392558011808321, + "grad_norm": 0.4813562333583832, + "learning_rate": 0.00025115995115995113, + "loss": 0.5982, + "step": 673 + }, + { + "epoch": 0.7403542496224084, + "grad_norm": 3.112766981124878, + "learning_rate": 0.000251037851037851, + "loss": 0.5825, + "step": 674 + }, + { + "epoch": 0.7414526980639846, + "grad_norm": 0.9523088932037354, + "learning_rate": 0.0002509157509157509, + "loss": 0.5698, + "step": 675 + }, + { + "epoch": 0.7425511465055609, + "grad_norm": 0.3426001965999603, + "learning_rate": 0.00025079365079365076, + "loss": 0.5516, + "step": 676 + }, + { + "epoch": 0.7436495949471371, + "grad_norm": 0.4866350591182709, + "learning_rate": 0.0002506715506715506, + "loss": 0.5466, + "step": 677 + }, + { + "epoch": 0.7447480433887135, + "grad_norm": 0.6590595245361328, + "learning_rate": 0.00025054945054945053, + "loss": 0.7579, + "step": 678 + }, + { + "epoch": 0.7458464918302897, + "grad_norm": 0.36733704805374146, + "learning_rate": 0.0002504273504273504, + "loss": 0.5114, + "step": 679 + }, + { + "epoch": 0.746944940271866, + "grad_norm": 0.5890951156616211, + "learning_rate": 0.0002503052503052503, + "loss": 0.7196, + "step": 680 + }, + { + "epoch": 0.7480433887134422, + "grad_norm": 0.8393438458442688, + "learning_rate": 0.00025018315018315016, + "loss": 0.6291, + "step": 681 + }, + { + "epoch": 0.7491418371550186, + "grad_norm": 0.9745636582374573, + "learning_rate": 0.00025006105006105, + "loss": 0.8675, + "step": 682 + }, + { + "epoch": 0.7502402855965948, + "grad_norm": 1.1764310598373413, + "learning_rate": 0.00024993894993894993, + "loss": 0.9384, + "step": 683 + }, + { + "epoch": 0.7513387340381711, + "grad_norm": 0.6199970245361328, + "learning_rate": 0.0002498168498168498, + "loss": 0.5984, + "step": 684 + }, + { + "epoch": 0.7524371824797473, + "grad_norm": 2.2708802223205566, + "learning_rate": 0.00024969474969474965, + "loss": 0.7867, + "step": 685 + }, + { + "epoch": 0.7535356309213236, + "grad_norm": 0.6731462478637695, + "learning_rate": 0.00024957264957264956, + "loss": 0.5377, + "step": 686 + }, + { + "epoch": 0.7546340793629, + "grad_norm": 0.991669774055481, + "learning_rate": 0.0002494505494505494, + "loss": 0.7015, + "step": 687 + }, + { + "epoch": 0.7557325278044762, + "grad_norm": 0.5873506665229797, + "learning_rate": 0.0002493284493284493, + "loss": 0.567, + "step": 688 + }, + { + "epoch": 0.7568309762460524, + "grad_norm": 1.5025473833084106, + "learning_rate": 0.0002492063492063492, + "loss": 0.6264, + "step": 689 + }, + { + "epoch": 0.7579294246876287, + "grad_norm": 0.4942665696144104, + "learning_rate": 0.00024908424908424905, + "loss": 0.7623, + "step": 690 + }, + { + "epoch": 0.7590278731292049, + "grad_norm": 0.5522105693817139, + "learning_rate": 0.0002489621489621489, + "loss": 0.6192, + "step": 691 + }, + { + "epoch": 0.7601263215707813, + "grad_norm": 1.25243079662323, + "learning_rate": 0.0002488400488400488, + "loss": 0.8547, + "step": 692 + }, + { + "epoch": 0.7612247700123576, + "grad_norm": 0.5228685140609741, + "learning_rate": 0.00024871794871794874, + "loss": 0.7365, + "step": 693 + }, + { + "epoch": 0.7623232184539338, + "grad_norm": 1.5090827941894531, + "learning_rate": 0.0002485958485958486, + "loss": 0.9226, + "step": 694 + }, + { + "epoch": 0.76342166689551, + "grad_norm": 3.3617379665374756, + "learning_rate": 0.00024847374847374845, + "loss": 0.7942, + "step": 695 + }, + { + "epoch": 0.7645201153370864, + "grad_norm": 0.5350137948989868, + "learning_rate": 0.0002483516483516483, + "loss": 0.6254, + "step": 696 + }, + { + "epoch": 0.7656185637786627, + "grad_norm": 0.8871312141418457, + "learning_rate": 0.0002482295482295482, + "loss": 0.8241, + "step": 697 + }, + { + "epoch": 0.7667170122202389, + "grad_norm": 0.48593926429748535, + "learning_rate": 0.0002481074481074481, + "loss": 0.5707, + "step": 698 + }, + { + "epoch": 0.7678154606618152, + "grad_norm": 0.7460000514984131, + "learning_rate": 0.00024798534798534794, + "loss": 0.9521, + "step": 699 + }, + { + "epoch": 0.7689139091033914, + "grad_norm": 0.7105034589767456, + "learning_rate": 0.00024786324786324785, + "loss": 0.7513, + "step": 700 + }, + { + "epoch": 0.7700123575449678, + "grad_norm": 0.40251481533050537, + "learning_rate": 0.0002477411477411477, + "loss": 0.6067, + "step": 701 + }, + { + "epoch": 0.771110805986544, + "grad_norm": 0.452709436416626, + "learning_rate": 0.00024761904761904757, + "loss": 0.671, + "step": 702 + }, + { + "epoch": 0.7722092544281203, + "grad_norm": 0.581453263759613, + "learning_rate": 0.0002474969474969475, + "loss": 0.5356, + "step": 703 + }, + { + "epoch": 0.7733077028696965, + "grad_norm": 0.8013669848442078, + "learning_rate": 0.00024737484737484734, + "loss": 0.6889, + "step": 704 + }, + { + "epoch": 0.7744061513112728, + "grad_norm": 1.1480565071105957, + "learning_rate": 0.0002472527472527472, + "loss": 0.7456, + "step": 705 + }, + { + "epoch": 0.7755045997528491, + "grad_norm": 0.7568329572677612, + "learning_rate": 0.0002471306471306471, + "loss": 0.7455, + "step": 706 + }, + { + "epoch": 0.7766030481944254, + "grad_norm": 0.4223226308822632, + "learning_rate": 0.000247008547008547, + "loss": 0.7138, + "step": 707 + }, + { + "epoch": 0.7777014966360016, + "grad_norm": 0.372872531414032, + "learning_rate": 0.00024688644688644683, + "loss": 0.8037, + "step": 708 + }, + { + "epoch": 0.7787999450775779, + "grad_norm": 0.968614399433136, + "learning_rate": 0.00024676434676434674, + "loss": 0.5943, + "step": 709 + }, + { + "epoch": 0.7798983935191542, + "grad_norm": 0.801157534122467, + "learning_rate": 0.00024664224664224665, + "loss": 0.9467, + "step": 710 + }, + { + "epoch": 0.7809968419607305, + "grad_norm": 0.7115808129310608, + "learning_rate": 0.0002465201465201465, + "loss": 0.7828, + "step": 711 + }, + { + "epoch": 0.7820952904023067, + "grad_norm": 1.2951349020004272, + "learning_rate": 0.00024639804639804637, + "loss": 0.6221, + "step": 712 + }, + { + "epoch": 0.783193738843883, + "grad_norm": 0.47706693410873413, + "learning_rate": 0.0002462759462759463, + "loss": 0.3641, + "step": 713 + }, + { + "epoch": 0.7842921872854592, + "grad_norm": 0.8871097564697266, + "learning_rate": 0.00024615384615384614, + "loss": 0.6177, + "step": 714 + }, + { + "epoch": 0.7853906357270356, + "grad_norm": 0.7920973896980286, + "learning_rate": 0.000246031746031746, + "loss": 0.5858, + "step": 715 + }, + { + "epoch": 0.7864890841686119, + "grad_norm": 0.49732694029808044, + "learning_rate": 0.0002459096459096459, + "loss": 0.5176, + "step": 716 + }, + { + "epoch": 0.7875875326101881, + "grad_norm": 0.34965720772743225, + "learning_rate": 0.00024578754578754577, + "loss": 0.4983, + "step": 717 + }, + { + "epoch": 0.7886859810517644, + "grad_norm": 0.45963025093078613, + "learning_rate": 0.00024566544566544563, + "loss": 0.7756, + "step": 718 + }, + { + "epoch": 0.7897844294933407, + "grad_norm": 0.5802373290061951, + "learning_rate": 0.00024554334554334554, + "loss": 0.5773, + "step": 719 + }, + { + "epoch": 0.790882877934917, + "grad_norm": 1.8482742309570312, + "learning_rate": 0.0002454212454212454, + "loss": 0.7978, + "step": 720 + }, + { + "epoch": 0.7919813263764932, + "grad_norm": 0.5821959972381592, + "learning_rate": 0.00024529914529914526, + "loss": 0.7483, + "step": 721 + }, + { + "epoch": 0.7930797748180695, + "grad_norm": 0.9352701306343079, + "learning_rate": 0.0002451770451770451, + "loss": 0.6979, + "step": 722 + }, + { + "epoch": 0.7941782232596457, + "grad_norm": 0.554032564163208, + "learning_rate": 0.00024505494505494503, + "loss": 0.6773, + "step": 723 + }, + { + "epoch": 0.7952766717012221, + "grad_norm": 0.6914504766464233, + "learning_rate": 0.00024493284493284494, + "loss": 0.6548, + "step": 724 + }, + { + "epoch": 0.7963751201427983, + "grad_norm": 0.40804949402809143, + "learning_rate": 0.0002448107448107448, + "loss": 0.4634, + "step": 725 + }, + { + "epoch": 0.7974735685843746, + "grad_norm": 0.4965716302394867, + "learning_rate": 0.00024468864468864466, + "loss": 0.4879, + "step": 726 + }, + { + "epoch": 0.7985720170259508, + "grad_norm": 0.48798999190330505, + "learning_rate": 0.00024456654456654457, + "loss": 0.7003, + "step": 727 + }, + { + "epoch": 0.7996704654675271, + "grad_norm": 0.6946013569831848, + "learning_rate": 0.00024444444444444443, + "loss": 0.7508, + "step": 728 + }, + { + "epoch": 0.8007689139091034, + "grad_norm": 0.4310678243637085, + "learning_rate": 0.0002443223443223443, + "loss": 0.5765, + "step": 729 + }, + { + "epoch": 0.8018673623506797, + "grad_norm": 0.5407636761665344, + "learning_rate": 0.0002442002442002442, + "loss": 0.5445, + "step": 730 + }, + { + "epoch": 0.8029658107922559, + "grad_norm": 0.6281490921974182, + "learning_rate": 0.00024407814407814403, + "loss": 0.9319, + "step": 731 + }, + { + "epoch": 0.8040642592338322, + "grad_norm": 1.2027008533477783, + "learning_rate": 0.00024395604395604394, + "loss": 0.3957, + "step": 732 + }, + { + "epoch": 0.8051627076754085, + "grad_norm": 0.543230414390564, + "learning_rate": 0.00024383394383394383, + "loss": 0.7919, + "step": 733 + }, + { + "epoch": 0.8062611561169848, + "grad_norm": 0.4269828498363495, + "learning_rate": 0.0002437118437118437, + "loss": 0.6081, + "step": 734 + }, + { + "epoch": 0.807359604558561, + "grad_norm": 1.2857966423034668, + "learning_rate": 0.00024358974358974357, + "loss": 0.8654, + "step": 735 + }, + { + "epoch": 0.8084580530001373, + "grad_norm": 0.6370485424995422, + "learning_rate": 0.00024346764346764346, + "loss": 0.8053, + "step": 736 + }, + { + "epoch": 0.8095565014417135, + "grad_norm": 1.1288559436798096, + "learning_rate": 0.00024334554334554332, + "loss": 0.8709, + "step": 737 + }, + { + "epoch": 0.8106549498832899, + "grad_norm": 0.5601497292518616, + "learning_rate": 0.0002432234432234432, + "loss": 0.7982, + "step": 738 + }, + { + "epoch": 0.8117533983248661, + "grad_norm": 0.476745069026947, + "learning_rate": 0.0002431013431013431, + "loss": 0.7372, + "step": 739 + }, + { + "epoch": 0.8128518467664424, + "grad_norm": 0.4287762939929962, + "learning_rate": 0.00024297924297924295, + "loss": 0.5686, + "step": 740 + }, + { + "epoch": 0.8139502952080186, + "grad_norm": 0.7039306163787842, + "learning_rate": 0.00024285714285714283, + "loss": 0.7976, + "step": 741 + }, + { + "epoch": 0.8150487436495949, + "grad_norm": 0.47433528304100037, + "learning_rate": 0.00024273504273504272, + "loss": 0.6375, + "step": 742 + }, + { + "epoch": 0.8161471920911713, + "grad_norm": 0.5443944931030273, + "learning_rate": 0.00024261294261294258, + "loss": 0.6793, + "step": 743 + }, + { + "epoch": 0.8172456405327475, + "grad_norm": 0.516094982624054, + "learning_rate": 0.00024249084249084246, + "loss": 0.785, + "step": 744 + }, + { + "epoch": 0.8183440889743238, + "grad_norm": 0.6694304347038269, + "learning_rate": 0.00024236874236874237, + "loss": 0.5431, + "step": 745 + }, + { + "epoch": 0.8194425374159, + "grad_norm": 0.5309669375419617, + "learning_rate": 0.00024224664224664223, + "loss": 0.5806, + "step": 746 + }, + { + "epoch": 0.8205409858574764, + "grad_norm": 0.5502971410751343, + "learning_rate": 0.00024212454212454212, + "loss": 0.5053, + "step": 747 + }, + { + "epoch": 0.8216394342990526, + "grad_norm": 0.5242869853973389, + "learning_rate": 0.00024200244200244198, + "loss": 0.8189, + "step": 748 + }, + { + "epoch": 0.8227378827406289, + "grad_norm": 0.4131311774253845, + "learning_rate": 0.00024188034188034186, + "loss": 0.7074, + "step": 749 + }, + { + "epoch": 0.8238363311822051, + "grad_norm": 0.599915087223053, + "learning_rate": 0.00024175824175824175, + "loss": 0.9408, + "step": 750 + }, + { + "epoch": 0.8249347796237814, + "grad_norm": 0.3683515191078186, + "learning_rate": 0.0002416361416361416, + "loss": 0.6675, + "step": 751 + }, + { + "epoch": 0.8260332280653577, + "grad_norm": 1.633415699005127, + "learning_rate": 0.0002415140415140415, + "loss": 0.6768, + "step": 752 + }, + { + "epoch": 0.827131676506934, + "grad_norm": 0.3848377764225006, + "learning_rate": 0.00024139194139194138, + "loss": 0.485, + "step": 753 + }, + { + "epoch": 0.8282301249485102, + "grad_norm": 0.4116027355194092, + "learning_rate": 0.00024126984126984123, + "loss": 0.8253, + "step": 754 + }, + { + "epoch": 0.8293285733900865, + "grad_norm": 0.5805407762527466, + "learning_rate": 0.00024114774114774112, + "loss": 0.825, + "step": 755 + }, + { + "epoch": 0.8304270218316627, + "grad_norm": 1.2401742935180664, + "learning_rate": 0.000241025641025641, + "loss": 0.6394, + "step": 756 + }, + { + "epoch": 0.8315254702732391, + "grad_norm": 0.42345038056373596, + "learning_rate": 0.00024090354090354086, + "loss": 0.6958, + "step": 757 + }, + { + "epoch": 0.8326239187148153, + "grad_norm": 1.3758116960525513, + "learning_rate": 0.00024078144078144075, + "loss": 0.6997, + "step": 758 + }, + { + "epoch": 0.8337223671563916, + "grad_norm": 1.1826672554016113, + "learning_rate": 0.00024065934065934066, + "loss": 0.7908, + "step": 759 + }, + { + "epoch": 0.8348208155979678, + "grad_norm": 1.0752373933792114, + "learning_rate": 0.0002405372405372405, + "loss": 0.8896, + "step": 760 + }, + { + "epoch": 0.8359192640395442, + "grad_norm": 0.3347112834453583, + "learning_rate": 0.0002404151404151404, + "loss": 0.8202, + "step": 761 + }, + { + "epoch": 0.8370177124811204, + "grad_norm": 0.5837082266807556, + "learning_rate": 0.0002402930402930403, + "loss": 0.7502, + "step": 762 + }, + { + "epoch": 0.8381161609226967, + "grad_norm": 0.5439388751983643, + "learning_rate": 0.00024017094017094015, + "loss": 0.6928, + "step": 763 + }, + { + "epoch": 0.839214609364273, + "grad_norm": 0.35348060727119446, + "learning_rate": 0.00024004884004884004, + "loss": 0.5495, + "step": 764 + }, + { + "epoch": 0.8403130578058492, + "grad_norm": 0.4943974018096924, + "learning_rate": 0.00023992673992673992, + "loss": 0.9218, + "step": 765 + }, + { + "epoch": 0.8414115062474256, + "grad_norm": 0.628667414188385, + "learning_rate": 0.00023980463980463978, + "loss": 0.6266, + "step": 766 + }, + { + "epoch": 0.8425099546890018, + "grad_norm": 0.822575032711029, + "learning_rate": 0.00023968253968253966, + "loss": 0.791, + "step": 767 + }, + { + "epoch": 0.843608403130578, + "grad_norm": 0.3044184446334839, + "learning_rate": 0.00023956043956043955, + "loss": 0.6048, + "step": 768 + }, + { + "epoch": 0.8447068515721543, + "grad_norm": 0.40807369351387024, + "learning_rate": 0.0002394383394383394, + "loss": 0.6286, + "step": 769 + }, + { + "epoch": 0.8458053000137306, + "grad_norm": 1.2373838424682617, + "learning_rate": 0.0002393162393162393, + "loss": 0.5133, + "step": 770 + }, + { + "epoch": 0.8469037484553069, + "grad_norm": 0.5104987025260925, + "learning_rate": 0.00023919413919413918, + "loss": 0.591, + "step": 771 + }, + { + "epoch": 0.8480021968968832, + "grad_norm": 0.6644220352172852, + "learning_rate": 0.00023907203907203904, + "loss": 0.7039, + "step": 772 + }, + { + "epoch": 0.8491006453384594, + "grad_norm": 0.5887960195541382, + "learning_rate": 0.00023894993894993892, + "loss": 0.7017, + "step": 773 + }, + { + "epoch": 0.8501990937800357, + "grad_norm": 0.6568577885627747, + "learning_rate": 0.00023882783882783878, + "loss": 0.6131, + "step": 774 + }, + { + "epoch": 0.851297542221612, + "grad_norm": 0.6594721674919128, + "learning_rate": 0.00023870573870573867, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.8523959906631883, + "grad_norm": 12.29937744140625, + "learning_rate": 0.00023858363858363858, + "loss": 1.1068, + "step": 776 + }, + { + "epoch": 0.8534944391047645, + "grad_norm": 1.175355315208435, + "learning_rate": 0.00023846153846153844, + "loss": 0.734, + "step": 777 + }, + { + "epoch": 0.8545928875463408, + "grad_norm": 1.7128019332885742, + "learning_rate": 0.00023833943833943832, + "loss": 0.6395, + "step": 778 + }, + { + "epoch": 0.855691335987917, + "grad_norm": 0.6479717493057251, + "learning_rate": 0.0002382173382173382, + "loss": 0.8572, + "step": 779 + }, + { + "epoch": 0.8567897844294934, + "grad_norm": 0.9646544456481934, + "learning_rate": 0.00023809523809523807, + "loss": 1.1168, + "step": 780 + }, + { + "epoch": 0.8578882328710696, + "grad_norm": 0.8290930986404419, + "learning_rate": 0.00023797313797313795, + "loss": 0.4413, + "step": 781 + }, + { + "epoch": 0.8589866813126459, + "grad_norm": 0.6690389513969421, + "learning_rate": 0.00023785103785103784, + "loss": 1.1878, + "step": 782 + }, + { + "epoch": 0.8600851297542221, + "grad_norm": 0.6602356433868408, + "learning_rate": 0.0002377289377289377, + "loss": 0.5862, + "step": 783 + }, + { + "epoch": 0.8611835781957984, + "grad_norm": 0.612316370010376, + "learning_rate": 0.00023760683760683758, + "loss": 0.7971, + "step": 784 + }, + { + "epoch": 0.8622820266373747, + "grad_norm": 0.7429434657096863, + "learning_rate": 0.00023748473748473747, + "loss": 0.6265, + "step": 785 + }, + { + "epoch": 0.863380475078951, + "grad_norm": 0.40107640624046326, + "learning_rate": 0.00023736263736263733, + "loss": 0.6697, + "step": 786 + }, + { + "epoch": 0.8644789235205272, + "grad_norm": 0.45808035135269165, + "learning_rate": 0.0002372405372405372, + "loss": 0.7443, + "step": 787 + }, + { + "epoch": 0.8655773719621035, + "grad_norm": 0.36327049136161804, + "learning_rate": 0.0002371184371184371, + "loss": 0.6518, + "step": 788 + }, + { + "epoch": 0.8666758204036799, + "grad_norm": 0.45617833733558655, + "learning_rate": 0.00023699633699633696, + "loss": 0.792, + "step": 789 + }, + { + "epoch": 0.8677742688452561, + "grad_norm": 0.5354835391044617, + "learning_rate": 0.00023687423687423687, + "loss": 0.7788, + "step": 790 + }, + { + "epoch": 0.8688727172868324, + "grad_norm": 0.9770327210426331, + "learning_rate": 0.00023675213675213675, + "loss": 0.7267, + "step": 791 + }, + { + "epoch": 0.8699711657284086, + "grad_norm": 0.646757960319519, + "learning_rate": 0.0002366300366300366, + "loss": 0.7234, + "step": 792 + }, + { + "epoch": 0.8710696141699849, + "grad_norm": 0.4694693982601166, + "learning_rate": 0.0002365079365079365, + "loss": 0.8261, + "step": 793 + }, + { + "epoch": 0.8721680626115612, + "grad_norm": 0.9923954606056213, + "learning_rate": 0.00023638583638583638, + "loss": 0.703, + "step": 794 + }, + { + "epoch": 0.8732665110531375, + "grad_norm": 1.6440534591674805, + "learning_rate": 0.00023626373626373624, + "loss": 0.7654, + "step": 795 + }, + { + "epoch": 0.8743649594947137, + "grad_norm": 0.3947128653526306, + "learning_rate": 0.00023614163614163613, + "loss": 0.637, + "step": 796 + }, + { + "epoch": 0.87546340793629, + "grad_norm": 3.4264323711395264, + "learning_rate": 0.000236019536019536, + "loss": 0.7325, + "step": 797 + }, + { + "epoch": 0.8765618563778662, + "grad_norm": 0.5469256043434143, + "learning_rate": 0.00023589743589743587, + "loss": 0.8203, + "step": 798 + }, + { + "epoch": 0.8776603048194426, + "grad_norm": 0.5184471011161804, + "learning_rate": 0.00023577533577533576, + "loss": 0.7895, + "step": 799 + }, + { + "epoch": 0.8787587532610188, + "grad_norm": 0.8231347799301147, + "learning_rate": 0.00023565323565323562, + "loss": 0.7888, + "step": 800 + }, + { + "epoch": 0.8798572017025951, + "grad_norm": 14.826855659484863, + "learning_rate": 0.0002355311355311355, + "loss": 0.7564, + "step": 801 + }, + { + "epoch": 0.8809556501441713, + "grad_norm": 0.5809927582740784, + "learning_rate": 0.00023540903540903539, + "loss": 0.6702, + "step": 802 + }, + { + "epoch": 0.8820540985857477, + "grad_norm": 0.7244674563407898, + "learning_rate": 0.00023528693528693524, + "loss": 0.6475, + "step": 803 + }, + { + "epoch": 0.8831525470273239, + "grad_norm": 0.8071272373199463, + "learning_rate": 0.00023516483516483513, + "loss": 0.7434, + "step": 804 + }, + { + "epoch": 0.8842509954689002, + "grad_norm": 0.6872429847717285, + "learning_rate": 0.00023504273504273504, + "loss": 0.5968, + "step": 805 + }, + { + "epoch": 0.8853494439104764, + "grad_norm": 9.353965759277344, + "learning_rate": 0.00023492063492063487, + "loss": 0.4228, + "step": 806 + }, + { + "epoch": 0.8864478923520527, + "grad_norm": 0.47151222825050354, + "learning_rate": 0.00023479853479853479, + "loss": 0.6832, + "step": 807 + }, + { + "epoch": 0.887546340793629, + "grad_norm": 1.4599422216415405, + "learning_rate": 0.00023467643467643467, + "loss": 0.6692, + "step": 808 + }, + { + "epoch": 0.8886447892352053, + "grad_norm": 0.45811519026756287, + "learning_rate": 0.00023455433455433453, + "loss": 0.787, + "step": 809 + }, + { + "epoch": 0.8897432376767815, + "grad_norm": 1.077709674835205, + "learning_rate": 0.00023443223443223442, + "loss": 0.6695, + "step": 810 + }, + { + "epoch": 0.8908416861183578, + "grad_norm": 0.5702061057090759, + "learning_rate": 0.0002343101343101343, + "loss": 0.5858, + "step": 811 + }, + { + "epoch": 0.891940134559934, + "grad_norm": 2.2391059398651123, + "learning_rate": 0.00023418803418803416, + "loss": 0.6688, + "step": 812 + }, + { + "epoch": 0.8930385830015104, + "grad_norm": 1.6974279880523682, + "learning_rate": 0.00023406593406593405, + "loss": 0.8545, + "step": 813 + }, + { + "epoch": 0.8941370314430866, + "grad_norm": 0.983435869216919, + "learning_rate": 0.00023394383394383393, + "loss": 0.8128, + "step": 814 + }, + { + "epoch": 0.8952354798846629, + "grad_norm": 0.44103240966796875, + "learning_rate": 0.0002338217338217338, + "loss": 0.7968, + "step": 815 + }, + { + "epoch": 0.8963339283262391, + "grad_norm": 1.0707038640975952, + "learning_rate": 0.00023369963369963367, + "loss": 0.6996, + "step": 816 + }, + { + "epoch": 0.8974323767678155, + "grad_norm": 0.8029122352600098, + "learning_rate": 0.00023357753357753356, + "loss": 0.7911, + "step": 817 + }, + { + "epoch": 0.8985308252093918, + "grad_norm": 0.46339499950408936, + "learning_rate": 0.00023345543345543342, + "loss": 0.7712, + "step": 818 + }, + { + "epoch": 0.899629273650968, + "grad_norm": 1.020947813987732, + "learning_rate": 0.0002333333333333333, + "loss": 0.6865, + "step": 819 + }, + { + "epoch": 0.9007277220925443, + "grad_norm": 0.5332039594650269, + "learning_rate": 0.00023321123321123322, + "loss": 0.8352, + "step": 820 + }, + { + "epoch": 0.9018261705341205, + "grad_norm": 0.40052923560142517, + "learning_rate": 0.00023308913308913307, + "loss": 0.5435, + "step": 821 + }, + { + "epoch": 0.9029246189756969, + "grad_norm": 0.6643521189689636, + "learning_rate": 0.00023296703296703296, + "loss": 0.7406, + "step": 822 + }, + { + "epoch": 0.9040230674172731, + "grad_norm": 0.7514997720718384, + "learning_rate": 0.00023284493284493285, + "loss": 0.7595, + "step": 823 + }, + { + "epoch": 0.9051215158588494, + "grad_norm": 0.7124571204185486, + "learning_rate": 0.0002327228327228327, + "loss": 0.5736, + "step": 824 + }, + { + "epoch": 0.9062199643004256, + "grad_norm": 0.6757075786590576, + "learning_rate": 0.0002326007326007326, + "loss": 0.6275, + "step": 825 + }, + { + "epoch": 0.9073184127420019, + "grad_norm": 0.4200783669948578, + "learning_rate": 0.00023247863247863245, + "loss": 0.6267, + "step": 826 + }, + { + "epoch": 0.9084168611835782, + "grad_norm": 0.5442836284637451, + "learning_rate": 0.00023235653235653233, + "loss": 0.6814, + "step": 827 + }, + { + "epoch": 0.9095153096251545, + "grad_norm": 0.4859601557254791, + "learning_rate": 0.00023223443223443222, + "loss": 0.6451, + "step": 828 + }, + { + "epoch": 0.9106137580667307, + "grad_norm": 0.7353097200393677, + "learning_rate": 0.00023211233211233208, + "loss": 0.6723, + "step": 829 + }, + { + "epoch": 0.911712206508307, + "grad_norm": 0.6389304995536804, + "learning_rate": 0.00023199023199023196, + "loss": 0.9429, + "step": 830 + }, + { + "epoch": 0.9128106549498833, + "grad_norm": 0.6813933849334717, + "learning_rate": 0.00023186813186813185, + "loss": 0.5319, + "step": 831 + }, + { + "epoch": 0.9139091033914596, + "grad_norm": 0.40023690462112427, + "learning_rate": 0.0002317460317460317, + "loss": 0.5808, + "step": 832 + }, + { + "epoch": 0.9150075518330358, + "grad_norm": 0.5327205657958984, + "learning_rate": 0.0002316239316239316, + "loss": 0.6666, + "step": 833 + }, + { + "epoch": 0.9161060002746121, + "grad_norm": 1.672450065612793, + "learning_rate": 0.0002315018315018315, + "loss": 0.7758, + "step": 834 + }, + { + "epoch": 0.9172044487161883, + "grad_norm": 0.5022990703582764, + "learning_rate": 0.00023137973137973134, + "loss": 0.6309, + "step": 835 + }, + { + "epoch": 0.9183028971577647, + "grad_norm": 0.43023642897605896, + "learning_rate": 0.00023125763125763125, + "loss": 0.5343, + "step": 836 + }, + { + "epoch": 0.919401345599341, + "grad_norm": 0.6878641843795776, + "learning_rate": 0.00023113553113553113, + "loss": 0.7268, + "step": 837 + }, + { + "epoch": 0.9204997940409172, + "grad_norm": 0.40551453828811646, + "learning_rate": 0.000231013431013431, + "loss": 0.5784, + "step": 838 + }, + { + "epoch": 0.9215982424824934, + "grad_norm": 0.412356436252594, + "learning_rate": 0.00023089133089133088, + "loss": 0.7685, + "step": 839 + }, + { + "epoch": 0.9226966909240698, + "grad_norm": 1.1603305339813232, + "learning_rate": 0.00023076923076923076, + "loss": 0.518, + "step": 840 + }, + { + "epoch": 0.9237951393656461, + "grad_norm": 0.6733229756355286, + "learning_rate": 0.00023064713064713062, + "loss": 0.5883, + "step": 841 + }, + { + "epoch": 0.9248935878072223, + "grad_norm": 0.619434654712677, + "learning_rate": 0.0002305250305250305, + "loss": 0.6244, + "step": 842 + }, + { + "epoch": 0.9259920362487986, + "grad_norm": 0.6989772319793701, + "learning_rate": 0.0002304029304029304, + "loss": 0.5763, + "step": 843 + }, + { + "epoch": 0.9270904846903748, + "grad_norm": 0.6276418566703796, + "learning_rate": 0.00023028083028083025, + "loss": 0.4762, + "step": 844 + }, + { + "epoch": 0.9281889331319512, + "grad_norm": 0.5577360987663269, + "learning_rate": 0.00023015873015873014, + "loss": 0.6254, + "step": 845 + }, + { + "epoch": 0.9292873815735274, + "grad_norm": 0.6185848116874695, + "learning_rate": 0.00023003663003663002, + "loss": 1.0182, + "step": 846 + }, + { + "epoch": 0.9303858300151037, + "grad_norm": 1.2415262460708618, + "learning_rate": 0.00022991452991452988, + "loss": 0.4677, + "step": 847 + }, + { + "epoch": 0.9314842784566799, + "grad_norm": 0.4582594335079193, + "learning_rate": 0.00022979242979242977, + "loss": 0.6308, + "step": 848 + }, + { + "epoch": 0.9325827268982562, + "grad_norm": 0.4749620258808136, + "learning_rate": 0.00022967032967032962, + "loss": 0.6217, + "step": 849 + }, + { + "epoch": 0.9336811753398325, + "grad_norm": 0.48614588379859924, + "learning_rate": 0.0002295482295482295, + "loss": 0.7469, + "step": 850 + }, + { + "epoch": 0.9347796237814088, + "grad_norm": 0.7357453107833862, + "learning_rate": 0.00022942612942612942, + "loss": 0.5978, + "step": 851 + }, + { + "epoch": 0.935878072222985, + "grad_norm": 0.53326815366745, + "learning_rate": 0.00022930402930402928, + "loss": 0.7678, + "step": 852 + }, + { + "epoch": 0.9369765206645613, + "grad_norm": 0.4853271245956421, + "learning_rate": 0.00022918192918192917, + "loss": 0.4888, + "step": 853 + }, + { + "epoch": 0.9380749691061376, + "grad_norm": 1.6529743671417236, + "learning_rate": 0.00022905982905982905, + "loss": 0.6103, + "step": 854 + }, + { + "epoch": 0.9391734175477139, + "grad_norm": 0.8255143165588379, + "learning_rate": 0.0002289377289377289, + "loss": 0.6977, + "step": 855 + }, + { + "epoch": 0.9402718659892901, + "grad_norm": 0.3999016284942627, + "learning_rate": 0.0002288156288156288, + "loss": 0.5398, + "step": 856 + }, + { + "epoch": 0.9413703144308664, + "grad_norm": 1.933090329170227, + "learning_rate": 0.00022869352869352868, + "loss": 1.0827, + "step": 857 + }, + { + "epoch": 0.9424687628724426, + "grad_norm": 0.8884105682373047, + "learning_rate": 0.00022857142857142854, + "loss": 0.702, + "step": 858 + }, + { + "epoch": 0.943567211314019, + "grad_norm": 0.4555901885032654, + "learning_rate": 0.00022844932844932843, + "loss": 0.8737, + "step": 859 + }, + { + "epoch": 0.9446656597555952, + "grad_norm": 0.535915732383728, + "learning_rate": 0.0002283272283272283, + "loss": 0.7036, + "step": 860 + }, + { + "epoch": 0.9457641081971715, + "grad_norm": 0.7607597708702087, + "learning_rate": 0.00022820512820512817, + "loss": 0.8707, + "step": 861 + }, + { + "epoch": 0.9468625566387477, + "grad_norm": 0.4056457579135895, + "learning_rate": 0.00022808302808302805, + "loss": 0.6658, + "step": 862 + }, + { + "epoch": 0.947961005080324, + "grad_norm": 0.5472984313964844, + "learning_rate": 0.00022796092796092794, + "loss": 0.5429, + "step": 863 + }, + { + "epoch": 0.9490594535219004, + "grad_norm": 0.6866592764854431, + "learning_rate": 0.0002278388278388278, + "loss": 0.7343, + "step": 864 + }, + { + "epoch": 0.9501579019634766, + "grad_norm": 0.5244406461715698, + "learning_rate": 0.0002277167277167277, + "loss": 0.669, + "step": 865 + }, + { + "epoch": 0.9512563504050529, + "grad_norm": 0.45024383068084717, + "learning_rate": 0.0002275946275946276, + "loss": 0.9062, + "step": 866 + }, + { + "epoch": 0.9523547988466291, + "grad_norm": 0.4252873659133911, + "learning_rate": 0.00022747252747252745, + "loss": 0.6109, + "step": 867 + }, + { + "epoch": 0.9534532472882055, + "grad_norm": 0.50081467628479, + "learning_rate": 0.00022735042735042734, + "loss": 0.5266, + "step": 868 + }, + { + "epoch": 0.9545516957297817, + "grad_norm": 0.9674072861671448, + "learning_rate": 0.00022722832722832723, + "loss": 0.7197, + "step": 869 + }, + { + "epoch": 0.955650144171358, + "grad_norm": 1.572348952293396, + "learning_rate": 0.00022710622710622708, + "loss": 0.4728, + "step": 870 + }, + { + "epoch": 0.9567485926129342, + "grad_norm": 0.6033158898353577, + "learning_rate": 0.00022698412698412697, + "loss": 0.6394, + "step": 871 + }, + { + "epoch": 0.9578470410545105, + "grad_norm": 0.5810523629188538, + "learning_rate": 0.00022686202686202686, + "loss": 0.8813, + "step": 872 + }, + { + "epoch": 0.9589454894960868, + "grad_norm": 0.46345213055610657, + "learning_rate": 0.00022673992673992671, + "loss": 0.5828, + "step": 873 + }, + { + "epoch": 0.9600439379376631, + "grad_norm": 0.5414748191833496, + "learning_rate": 0.0002266178266178266, + "loss": 0.6311, + "step": 874 + }, + { + "epoch": 0.9611423863792393, + "grad_norm": 0.9083818197250366, + "learning_rate": 0.00022649572649572646, + "loss": 0.961, + "step": 875 + }, + { + "epoch": 0.9622408348208156, + "grad_norm": 0.786993145942688, + "learning_rate": 0.00022637362637362634, + "loss": 0.7825, + "step": 876 + }, + { + "epoch": 0.9633392832623918, + "grad_norm": 0.7639968991279602, + "learning_rate": 0.00022625152625152623, + "loss": 0.8989, + "step": 877 + }, + { + "epoch": 0.9644377317039682, + "grad_norm": 0.43360400199890137, + "learning_rate": 0.0002261294261294261, + "loss": 0.6747, + "step": 878 + }, + { + "epoch": 0.9655361801455444, + "grad_norm": 0.8512898683547974, + "learning_rate": 0.00022600732600732597, + "loss": 0.7152, + "step": 879 + }, + { + "epoch": 0.9666346285871207, + "grad_norm": 0.46903684735298157, + "learning_rate": 0.00022588522588522589, + "loss": 0.7594, + "step": 880 + }, + { + "epoch": 0.9677330770286969, + "grad_norm": 1.9560080766677856, + "learning_rate": 0.00022576312576312572, + "loss": 0.598, + "step": 881 + }, + { + "epoch": 0.9688315254702733, + "grad_norm": 1.1595470905303955, + "learning_rate": 0.00022564102564102563, + "loss": 0.6005, + "step": 882 + }, + { + "epoch": 0.9699299739118495, + "grad_norm": 0.7318668365478516, + "learning_rate": 0.00022551892551892551, + "loss": 0.7327, + "step": 883 + }, + { + "epoch": 0.9710284223534258, + "grad_norm": 0.6557647585868835, + "learning_rate": 0.00022539682539682537, + "loss": 0.5858, + "step": 884 + }, + { + "epoch": 0.972126870795002, + "grad_norm": 0.5645928382873535, + "learning_rate": 0.00022527472527472526, + "loss": 0.5818, + "step": 885 + }, + { + "epoch": 0.9732253192365783, + "grad_norm": 0.4630253314971924, + "learning_rate": 0.00022515262515262514, + "loss": 0.8363, + "step": 886 + }, + { + "epoch": 0.9743237676781547, + "grad_norm": 0.6750912666320801, + "learning_rate": 0.000225030525030525, + "loss": 0.8865, + "step": 887 + }, + { + "epoch": 0.9754222161197309, + "grad_norm": 0.6309487819671631, + "learning_rate": 0.0002249084249084249, + "loss": 0.5596, + "step": 888 + }, + { + "epoch": 0.9765206645613072, + "grad_norm": 0.9696050882339478, + "learning_rate": 0.00022478632478632477, + "loss": 0.7752, + "step": 889 + }, + { + "epoch": 0.9776191130028834, + "grad_norm": 0.7614735960960388, + "learning_rate": 0.00022466422466422463, + "loss": 0.7131, + "step": 890 + }, + { + "epoch": 0.9787175614444596, + "grad_norm": 0.4971006214618683, + "learning_rate": 0.00022454212454212452, + "loss": 0.6218, + "step": 891 + }, + { + "epoch": 0.979816009886036, + "grad_norm": 0.47809773683547974, + "learning_rate": 0.0002244200244200244, + "loss": 0.5678, + "step": 892 + }, + { + "epoch": 0.9809144583276123, + "grad_norm": 0.5959337949752808, + "learning_rate": 0.00022429792429792426, + "loss": 1.0002, + "step": 893 + }, + { + "epoch": 0.9820129067691885, + "grad_norm": 0.45277753472328186, + "learning_rate": 0.00022417582417582415, + "loss": 0.7321, + "step": 894 + }, + { + "epoch": 0.9831113552107648, + "grad_norm": 1.279405951499939, + "learning_rate": 0.00022405372405372406, + "loss": 0.7912, + "step": 895 + }, + { + "epoch": 0.9842098036523411, + "grad_norm": 0.49885687232017517, + "learning_rate": 0.00022393162393162392, + "loss": 0.5558, + "step": 896 + }, + { + "epoch": 0.9853082520939174, + "grad_norm": 0.474979430437088, + "learning_rate": 0.0002238095238095238, + "loss": 0.7095, + "step": 897 + }, + { + "epoch": 0.9864067005354936, + "grad_norm": 0.3826389014720917, + "learning_rate": 0.0002236874236874237, + "loss": 0.5695, + "step": 898 + }, + { + "epoch": 0.9875051489770699, + "grad_norm": 0.33514517545700073, + "learning_rate": 0.00022356532356532355, + "loss": 0.6341, + "step": 899 + }, + { + "epoch": 0.9886035974186461, + "grad_norm": 0.5049251914024353, + "learning_rate": 0.00022344322344322343, + "loss": 0.5577, + "step": 900 + }, + { + "epoch": 0.9897020458602225, + "grad_norm": 0.5179988145828247, + "learning_rate": 0.0002233211233211233, + "loss": 0.5769, + "step": 901 + }, + { + "epoch": 0.9908004943017987, + "grad_norm": 0.5194469094276428, + "learning_rate": 0.00022319902319902318, + "loss": 0.5466, + "step": 902 + }, + { + "epoch": 0.991898942743375, + "grad_norm": 0.46941491961479187, + "learning_rate": 0.00022307692307692306, + "loss": 0.642, + "step": 903 + }, + { + "epoch": 0.9929973911849512, + "grad_norm": 0.379682719707489, + "learning_rate": 0.00022295482295482292, + "loss": 0.5508, + "step": 904 + }, + { + "epoch": 0.9940958396265275, + "grad_norm": 1.3844119310379028, + "learning_rate": 0.0002228327228327228, + "loss": 0.8814, + "step": 905 + }, + { + "epoch": 0.9951942880681038, + "grad_norm": 2.497697114944458, + "learning_rate": 0.0002227106227106227, + "loss": 0.8116, + "step": 906 + }, + { + "epoch": 0.9962927365096801, + "grad_norm": 0.36689239740371704, + "learning_rate": 0.00022258852258852255, + "loss": 0.5001, + "step": 907 + }, + { + "epoch": 0.9973911849512563, + "grad_norm": 0.39868447184562683, + "learning_rate": 0.00022246642246642243, + "loss": 0.6913, + "step": 908 + }, + { + "epoch": 0.9984896333928326, + "grad_norm": 0.5270336270332336, + "learning_rate": 0.00022234432234432235, + "loss": 0.5401, + "step": 909 + }, + { + "epoch": 0.999588081834409, + "grad_norm": 0.4079851508140564, + "learning_rate": 0.00022222222222222218, + "loss": 0.471, + "step": 910 + }, + { + "epoch": 1.000686530275985, + "grad_norm": 0.43189048767089844, + "learning_rate": 0.0002221001221001221, + "loss": 0.8237, + "step": 911 + }, + { + "epoch": 1.0017849787175614, + "grad_norm": 0.52342289686203, + "learning_rate": 0.00022197802197802198, + "loss": 0.6363, + "step": 912 + }, + { + "epoch": 1.0028834271591378, + "grad_norm": 0.38078904151916504, + "learning_rate": 0.00022185592185592184, + "loss": 0.4411, + "step": 913 + }, + { + "epoch": 1.003981875600714, + "grad_norm": 0.5302817821502686, + "learning_rate": 0.00022173382173382172, + "loss": 0.858, + "step": 914 + }, + { + "epoch": 1.0050803240422903, + "grad_norm": 0.3696751892566681, + "learning_rate": 0.0002216117216117216, + "loss": 0.8766, + "step": 915 + }, + { + "epoch": 1.0061787724838664, + "grad_norm": 0.7566766738891602, + "learning_rate": 0.00022148962148962146, + "loss": 1.067, + "step": 916 + }, + { + "epoch": 1.0072772209254428, + "grad_norm": 0.7399318218231201, + "learning_rate": 0.00022136752136752135, + "loss": 0.6683, + "step": 917 + }, + { + "epoch": 1.0083756693670192, + "grad_norm": 0.5435899496078491, + "learning_rate": 0.00022124542124542124, + "loss": 0.6045, + "step": 918 + }, + { + "epoch": 1.0094741178085953, + "grad_norm": 0.9680571556091309, + "learning_rate": 0.0002211233211233211, + "loss": 0.7546, + "step": 919 + }, + { + "epoch": 1.0105725662501717, + "grad_norm": 0.6131067872047424, + "learning_rate": 0.00022100122100122098, + "loss": 0.6655, + "step": 920 + }, + { + "epoch": 1.0116710146917478, + "grad_norm": 0.8093316555023193, + "learning_rate": 0.00022087912087912086, + "loss": 0.4812, + "step": 921 + }, + { + "epoch": 1.0127694631333242, + "grad_norm": 0.5077763199806213, + "learning_rate": 0.00022075702075702072, + "loss": 0.5357, + "step": 922 + }, + { + "epoch": 1.0138679115749005, + "grad_norm": 0.4767695963382721, + "learning_rate": 0.0002206349206349206, + "loss": 0.5807, + "step": 923 + }, + { + "epoch": 1.0149663600164767, + "grad_norm": 0.3215581178665161, + "learning_rate": 0.00022051282051282052, + "loss": 0.5773, + "step": 924 + }, + { + "epoch": 1.016064808458053, + "grad_norm": 0.425603985786438, + "learning_rate": 0.00022039072039072035, + "loss": 0.5441, + "step": 925 + }, + { + "epoch": 1.0171632568996292, + "grad_norm": 0.6131730079650879, + "learning_rate": 0.00022026862026862027, + "loss": 0.856, + "step": 926 + }, + { + "epoch": 1.0182617053412055, + "grad_norm": 0.5472941398620605, + "learning_rate": 0.00022014652014652012, + "loss": 0.8228, + "step": 927 + }, + { + "epoch": 1.0193601537827819, + "grad_norm": 0.46728211641311646, + "learning_rate": 0.00022002442002442, + "loss": 0.7615, + "step": 928 + }, + { + "epoch": 1.020458602224358, + "grad_norm": 0.39919501543045044, + "learning_rate": 0.0002199023199023199, + "loss": 0.709, + "step": 929 + }, + { + "epoch": 1.0215570506659344, + "grad_norm": 0.564400315284729, + "learning_rate": 0.00021978021978021975, + "loss": 0.5941, + "step": 930 + }, + { + "epoch": 1.0226554991075107, + "grad_norm": 0.39073804020881653, + "learning_rate": 0.00021965811965811964, + "loss": 0.6386, + "step": 931 + }, + { + "epoch": 1.0237539475490869, + "grad_norm": 0.3725563585758209, + "learning_rate": 0.00021953601953601952, + "loss": 0.4766, + "step": 932 + }, + { + "epoch": 1.0248523959906632, + "grad_norm": 1.319197654724121, + "learning_rate": 0.00021941391941391938, + "loss": 0.8465, + "step": 933 + }, + { + "epoch": 1.0259508444322394, + "grad_norm": 0.5126785635948181, + "learning_rate": 0.00021929181929181927, + "loss": 0.5103, + "step": 934 + }, + { + "epoch": 1.0270492928738157, + "grad_norm": 0.5401897430419922, + "learning_rate": 0.00021916971916971915, + "loss": 0.5879, + "step": 935 + }, + { + "epoch": 1.028147741315392, + "grad_norm": 0.47014057636260986, + "learning_rate": 0.000219047619047619, + "loss": 0.658, + "step": 936 + }, + { + "epoch": 1.0292461897569682, + "grad_norm": 0.49227291345596313, + "learning_rate": 0.0002189255189255189, + "loss": 0.5271, + "step": 937 + }, + { + "epoch": 1.0303446381985446, + "grad_norm": 0.8186778426170349, + "learning_rate": 0.00021880341880341878, + "loss": 0.6491, + "step": 938 + }, + { + "epoch": 1.0314430866401207, + "grad_norm": 0.46345674991607666, + "learning_rate": 0.00021868131868131864, + "loss": 0.7935, + "step": 939 + }, + { + "epoch": 1.032541535081697, + "grad_norm": 1.7300915718078613, + "learning_rate": 0.00021855921855921855, + "loss": 0.516, + "step": 940 + }, + { + "epoch": 1.0336399835232735, + "grad_norm": 0.5100822448730469, + "learning_rate": 0.00021843711843711844, + "loss": 0.8286, + "step": 941 + }, + { + "epoch": 1.0347384319648496, + "grad_norm": 0.42278483510017395, + "learning_rate": 0.0002183150183150183, + "loss": 0.7312, + "step": 942 + }, + { + "epoch": 1.035836880406426, + "grad_norm": 0.42105185985565186, + "learning_rate": 0.00021819291819291818, + "loss": 0.5729, + "step": 943 + }, + { + "epoch": 1.036935328848002, + "grad_norm": 0.5117312669754028, + "learning_rate": 0.00021807081807081807, + "loss": 0.7688, + "step": 944 + }, + { + "epoch": 1.0380337772895785, + "grad_norm": 0.4982740879058838, + "learning_rate": 0.00021794871794871793, + "loss": 0.5746, + "step": 945 + }, + { + "epoch": 1.0391322257311548, + "grad_norm": 0.5181052684783936, + "learning_rate": 0.0002178266178266178, + "loss": 0.8446, + "step": 946 + }, + { + "epoch": 1.040230674172731, + "grad_norm": 5.104315757751465, + "learning_rate": 0.0002177045177045177, + "loss": 0.9641, + "step": 947 + }, + { + "epoch": 1.0413291226143073, + "grad_norm": 0.7384645938873291, + "learning_rate": 0.00021758241758241756, + "loss": 0.7168, + "step": 948 + }, + { + "epoch": 1.0424275710558835, + "grad_norm": 0.4367550313472748, + "learning_rate": 0.00021746031746031744, + "loss": 0.7139, + "step": 949 + }, + { + "epoch": 1.0435260194974598, + "grad_norm": 0.7332566380500793, + "learning_rate": 0.00021733821733821733, + "loss": 0.7082, + "step": 950 + }, + { + "epoch": 1.0446244679390362, + "grad_norm": 0.4191775918006897, + "learning_rate": 0.00021721611721611719, + "loss": 0.7986, + "step": 951 + }, + { + "epoch": 1.0457229163806123, + "grad_norm": 0.33929941058158875, + "learning_rate": 0.00021709401709401707, + "loss": 0.3784, + "step": 952 + }, + { + "epoch": 1.0468213648221887, + "grad_norm": 0.5255181789398193, + "learning_rate": 0.00021697191697191693, + "loss": 0.5842, + "step": 953 + }, + { + "epoch": 1.047919813263765, + "grad_norm": 0.5401780605316162, + "learning_rate": 0.00021684981684981681, + "loss": 0.7939, + "step": 954 + }, + { + "epoch": 1.0490182617053412, + "grad_norm": 0.34873855113983154, + "learning_rate": 0.00021672771672771673, + "loss": 0.7957, + "step": 955 + }, + { + "epoch": 1.0501167101469175, + "grad_norm": 0.33418160676956177, + "learning_rate": 0.00021660561660561656, + "loss": 0.6037, + "step": 956 + }, + { + "epoch": 1.0512151585884937, + "grad_norm": 0.3197249174118042, + "learning_rate": 0.00021648351648351647, + "loss": 0.5223, + "step": 957 + }, + { + "epoch": 1.05231360703007, + "grad_norm": 0.5962835550308228, + "learning_rate": 0.00021636141636141636, + "loss": 0.5213, + "step": 958 + }, + { + "epoch": 1.0534120554716464, + "grad_norm": 1.3891643285751343, + "learning_rate": 0.00021623931623931622, + "loss": 0.6781, + "step": 959 + }, + { + "epoch": 1.0545105039132225, + "grad_norm": 0.42117932438850403, + "learning_rate": 0.0002161172161172161, + "loss": 0.6363, + "step": 960 + }, + { + "epoch": 1.055608952354799, + "grad_norm": 0.4514491558074951, + "learning_rate": 0.00021599511599511599, + "loss": 0.6904, + "step": 961 + }, + { + "epoch": 1.056707400796375, + "grad_norm": 0.4863387644290924, + "learning_rate": 0.00021587301587301584, + "loss": 0.6595, + "step": 962 + }, + { + "epoch": 1.0578058492379514, + "grad_norm": 0.6178450584411621, + "learning_rate": 0.00021575091575091573, + "loss": 0.8412, + "step": 963 + }, + { + "epoch": 1.0589042976795278, + "grad_norm": 0.3728642761707306, + "learning_rate": 0.00021562881562881562, + "loss": 0.629, + "step": 964 + }, + { + "epoch": 1.060002746121104, + "grad_norm": 0.7554892301559448, + "learning_rate": 0.00021550671550671547, + "loss": 0.5804, + "step": 965 + }, + { + "epoch": 1.0611011945626803, + "grad_norm": 0.550298273563385, + "learning_rate": 0.00021538461538461536, + "loss": 0.476, + "step": 966 + }, + { + "epoch": 1.0621996430042564, + "grad_norm": 0.4082244336605072, + "learning_rate": 0.00021526251526251524, + "loss": 0.4001, + "step": 967 + }, + { + "epoch": 1.0632980914458328, + "grad_norm": 1.2327499389648438, + "learning_rate": 0.0002151404151404151, + "loss": 0.4583, + "step": 968 + }, + { + "epoch": 1.0643965398874091, + "grad_norm": 0.860550045967102, + "learning_rate": 0.000215018315018315, + "loss": 0.6415, + "step": 969 + }, + { + "epoch": 1.0654949883289853, + "grad_norm": 0.558860182762146, + "learning_rate": 0.0002148962148962149, + "loss": 0.6215, + "step": 970 + }, + { + "epoch": 1.0665934367705616, + "grad_norm": 0.7794890403747559, + "learning_rate": 0.00021477411477411476, + "loss": 0.5094, + "step": 971 + }, + { + "epoch": 1.0676918852121378, + "grad_norm": 0.48574942350387573, + "learning_rate": 0.00021465201465201465, + "loss": 0.7385, + "step": 972 + }, + { + "epoch": 1.0687903336537141, + "grad_norm": 0.4496791660785675, + "learning_rate": 0.00021452991452991453, + "loss": 0.5036, + "step": 973 + }, + { + "epoch": 1.0698887820952905, + "grad_norm": 0.5360952615737915, + "learning_rate": 0.0002144078144078144, + "loss": 0.6825, + "step": 974 + }, + { + "epoch": 1.0709872305368666, + "grad_norm": 0.5783904194831848, + "learning_rate": 0.00021428571428571427, + "loss": 0.6736, + "step": 975 + }, + { + "epoch": 1.072085678978443, + "grad_norm": 2.290815830230713, + "learning_rate": 0.00021416361416361416, + "loss": 0.696, + "step": 976 + }, + { + "epoch": 1.0731841274200193, + "grad_norm": 1.3432899713516235, + "learning_rate": 0.00021404151404151402, + "loss": 0.5296, + "step": 977 + }, + { + "epoch": 1.0742825758615955, + "grad_norm": 0.5308722257614136, + "learning_rate": 0.0002139194139194139, + "loss": 0.6642, + "step": 978 + }, + { + "epoch": 1.0753810243031718, + "grad_norm": 0.7245768904685974, + "learning_rate": 0.00021379731379731376, + "loss": 0.6811, + "step": 979 + }, + { + "epoch": 1.076479472744748, + "grad_norm": 0.3873349726200104, + "learning_rate": 0.00021367521367521365, + "loss": 0.8503, + "step": 980 + }, + { + "epoch": 1.0775779211863243, + "grad_norm": 0.5792405605316162, + "learning_rate": 0.00021355311355311353, + "loss": 0.4543, + "step": 981 + }, + { + "epoch": 1.0786763696279005, + "grad_norm": 0.6543241143226624, + "learning_rate": 0.0002134310134310134, + "loss": 0.7778, + "step": 982 + }, + { + "epoch": 1.0797748180694768, + "grad_norm": 0.5572071075439453, + "learning_rate": 0.00021330891330891328, + "loss": 0.8446, + "step": 983 + }, + { + "epoch": 1.0808732665110532, + "grad_norm": 0.5798014402389526, + "learning_rate": 0.0002131868131868132, + "loss": 0.7461, + "step": 984 + }, + { + "epoch": 1.0819717149526293, + "grad_norm": 0.8282085657119751, + "learning_rate": 0.00021306471306471302, + "loss": 0.612, + "step": 985 + }, + { + "epoch": 1.0830701633942057, + "grad_norm": 0.5782580971717834, + "learning_rate": 0.00021294261294261293, + "loss": 0.5506, + "step": 986 + }, + { + "epoch": 1.084168611835782, + "grad_norm": 0.3826775848865509, + "learning_rate": 0.00021282051282051282, + "loss": 0.7859, + "step": 987 + }, + { + "epoch": 1.0852670602773582, + "grad_norm": 0.534752368927002, + "learning_rate": 0.00021269841269841268, + "loss": 0.8835, + "step": 988 + }, + { + "epoch": 1.0863655087189346, + "grad_norm": 0.45931264758110046, + "learning_rate": 0.00021257631257631256, + "loss": 0.6694, + "step": 989 + }, + { + "epoch": 1.0874639571605107, + "grad_norm": 0.6106250286102295, + "learning_rate": 0.00021245421245421245, + "loss": 0.8274, + "step": 990 + }, + { + "epoch": 1.088562405602087, + "grad_norm": 0.3704061806201935, + "learning_rate": 0.0002123321123321123, + "loss": 0.7449, + "step": 991 + }, + { + "epoch": 1.0896608540436634, + "grad_norm": 0.3922840356826782, + "learning_rate": 0.0002122100122100122, + "loss": 0.5845, + "step": 992 + }, + { + "epoch": 1.0907593024852396, + "grad_norm": 0.48152726888656616, + "learning_rate": 0.00021208791208791208, + "loss": 0.6608, + "step": 993 + }, + { + "epoch": 1.091857750926816, + "grad_norm": 0.42257216572761536, + "learning_rate": 0.00021196581196581194, + "loss": 0.6379, + "step": 994 + }, + { + "epoch": 1.092956199368392, + "grad_norm": 0.4746345579624176, + "learning_rate": 0.00021184371184371182, + "loss": 0.6467, + "step": 995 + }, + { + "epoch": 1.0940546478099684, + "grad_norm": 0.3915644884109497, + "learning_rate": 0.0002117216117216117, + "loss": 0.9699, + "step": 996 + }, + { + "epoch": 1.0951530962515448, + "grad_norm": 0.5957880020141602, + "learning_rate": 0.00021159951159951157, + "loss": 0.6917, + "step": 997 + }, + { + "epoch": 1.096251544693121, + "grad_norm": 0.4327985942363739, + "learning_rate": 0.00021147741147741145, + "loss": 0.8091, + "step": 998 + }, + { + "epoch": 1.0973499931346973, + "grad_norm": 0.42600274085998535, + "learning_rate": 0.00021135531135531136, + "loss": 0.7685, + "step": 999 + }, + { + "epoch": 1.0984484415762734, + "grad_norm": 0.7165039777755737, + "learning_rate": 0.0002112332112332112, + "loss": 0.8646, + "step": 1000 + }, + { + "epoch": 1.0995468900178498, + "grad_norm": 0.447652131319046, + "learning_rate": 0.0002111111111111111, + "loss": 0.521, + "step": 1001 + }, + { + "epoch": 1.1006453384594261, + "grad_norm": 0.3022591769695282, + "learning_rate": 0.000210989010989011, + "loss": 0.6099, + "step": 1002 + }, + { + "epoch": 1.1017437869010023, + "grad_norm": 0.32764387130737305, + "learning_rate": 0.00021086691086691085, + "loss": 0.5624, + "step": 1003 + }, + { + "epoch": 1.1028422353425786, + "grad_norm": 0.7301959991455078, + "learning_rate": 0.00021074481074481074, + "loss": 0.6091, + "step": 1004 + }, + { + "epoch": 1.1039406837841548, + "grad_norm": 0.4734131097793579, + "learning_rate": 0.0002106227106227106, + "loss": 0.6849, + "step": 1005 + }, + { + "epoch": 1.1050391322257311, + "grad_norm": 0.7214820384979248, + "learning_rate": 0.00021050061050061048, + "loss": 0.789, + "step": 1006 + }, + { + "epoch": 1.1061375806673075, + "grad_norm": 0.31265702843666077, + "learning_rate": 0.00021037851037851037, + "loss": 0.5176, + "step": 1007 + }, + { + "epoch": 1.1072360291088836, + "grad_norm": 0.5804157257080078, + "learning_rate": 0.00021025641025641022, + "loss": 1.0152, + "step": 1008 + }, + { + "epoch": 1.10833447755046, + "grad_norm": 0.3624595105648041, + "learning_rate": 0.0002101343101343101, + "loss": 0.6843, + "step": 1009 + }, + { + "epoch": 1.1094329259920364, + "grad_norm": 0.5099515318870544, + "learning_rate": 0.00021001221001221, + "loss": 0.5568, + "step": 1010 + }, + { + "epoch": 1.1105313744336125, + "grad_norm": 0.46201249957084656, + "learning_rate": 0.00020989010989010985, + "loss": 0.5883, + "step": 1011 + }, + { + "epoch": 1.1116298228751889, + "grad_norm": 0.4493483603000641, + "learning_rate": 0.00020976800976800974, + "loss": 0.8338, + "step": 1012 + }, + { + "epoch": 1.112728271316765, + "grad_norm": 0.4771614968776703, + "learning_rate": 0.00020964590964590963, + "loss": 0.7251, + "step": 1013 + }, + { + "epoch": 1.1138267197583414, + "grad_norm": 2.073347806930542, + "learning_rate": 0.00020952380952380948, + "loss": 0.8921, + "step": 1014 + }, + { + "epoch": 1.1149251681999177, + "grad_norm": 0.435680091381073, + "learning_rate": 0.0002094017094017094, + "loss": 0.5444, + "step": 1015 + }, + { + "epoch": 1.1160236166414939, + "grad_norm": 0.46824783086776733, + "learning_rate": 0.00020927960927960928, + "loss": 0.5591, + "step": 1016 + }, + { + "epoch": 1.1171220650830702, + "grad_norm": 0.43938374519348145, + "learning_rate": 0.00020915750915750914, + "loss": 0.7476, + "step": 1017 + }, + { + "epoch": 1.1182205135246464, + "grad_norm": 0.3620377779006958, + "learning_rate": 0.00020903540903540903, + "loss": 0.5763, + "step": 1018 + }, + { + "epoch": 1.1193189619662227, + "grad_norm": 0.612406313419342, + "learning_rate": 0.0002089133089133089, + "loss": 0.706, + "step": 1019 + }, + { + "epoch": 1.120417410407799, + "grad_norm": 0.5045173168182373, + "learning_rate": 0.00020879120879120877, + "loss": 0.6799, + "step": 1020 + }, + { + "epoch": 1.1215158588493752, + "grad_norm": 0.4815331995487213, + "learning_rate": 0.00020866910866910865, + "loss": 0.8845, + "step": 1021 + }, + { + "epoch": 1.1226143072909516, + "grad_norm": 0.3756159245967865, + "learning_rate": 0.00020854700854700854, + "loss": 0.5545, + "step": 1022 + }, + { + "epoch": 1.1237127557325277, + "grad_norm": 0.3184347152709961, + "learning_rate": 0.0002084249084249084, + "loss": 0.5109, + "step": 1023 + }, + { + "epoch": 1.124811204174104, + "grad_norm": 0.4000808298587799, + "learning_rate": 0.00020830280830280828, + "loss": 0.8363, + "step": 1024 + }, + { + "epoch": 1.1259096526156804, + "grad_norm": 0.3930743336677551, + "learning_rate": 0.00020818070818070817, + "loss": 0.6183, + "step": 1025 + }, + { + "epoch": 1.1270081010572566, + "grad_norm": 0.7536817789077759, + "learning_rate": 0.00020805860805860803, + "loss": 0.7511, + "step": 1026 + }, + { + "epoch": 1.128106549498833, + "grad_norm": 0.5012079477310181, + "learning_rate": 0.00020793650793650791, + "loss": 0.6346, + "step": 1027 + }, + { + "epoch": 1.129204997940409, + "grad_norm": 0.9914690852165222, + "learning_rate": 0.00020781440781440783, + "loss": 0.5827, + "step": 1028 + }, + { + "epoch": 1.1303034463819854, + "grad_norm": 0.9096476435661316, + "learning_rate": 0.00020769230769230766, + "loss": 1.0235, + "step": 1029 + }, + { + "epoch": 1.1314018948235618, + "grad_norm": 0.6668229699134827, + "learning_rate": 0.00020757020757020757, + "loss": 0.741, + "step": 1030 + }, + { + "epoch": 1.132500343265138, + "grad_norm": 0.3232771158218384, + "learning_rate": 0.0002074481074481074, + "loss": 0.6206, + "step": 1031 + }, + { + "epoch": 1.1335987917067143, + "grad_norm": 0.278003990650177, + "learning_rate": 0.00020732600732600731, + "loss": 0.5661, + "step": 1032 + }, + { + "epoch": 1.1346972401482907, + "grad_norm": 1.481213927268982, + "learning_rate": 0.0002072039072039072, + "loss": 0.6422, + "step": 1033 + }, + { + "epoch": 1.1357956885898668, + "grad_norm": 0.4688512682914734, + "learning_rate": 0.00020708180708180706, + "loss": 0.4163, + "step": 1034 + }, + { + "epoch": 1.1368941370314432, + "grad_norm": 0.6438425779342651, + "learning_rate": 0.00020695970695970694, + "loss": 0.6241, + "step": 1035 + }, + { + "epoch": 1.1379925854730193, + "grad_norm": 0.5013176798820496, + "learning_rate": 0.00020683760683760683, + "loss": 0.6273, + "step": 1036 + }, + { + "epoch": 1.1390910339145957, + "grad_norm": 0.5178597569465637, + "learning_rate": 0.0002067155067155067, + "loss": 0.7489, + "step": 1037 + }, + { + "epoch": 1.1401894823561718, + "grad_norm": 0.5804840922355652, + "learning_rate": 0.00020659340659340657, + "loss": 0.9142, + "step": 1038 + }, + { + "epoch": 1.1412879307977482, + "grad_norm": 0.47613444924354553, + "learning_rate": 0.00020647130647130646, + "loss": 0.9531, + "step": 1039 + }, + { + "epoch": 1.1423863792393245, + "grad_norm": 0.4835624694824219, + "learning_rate": 0.00020634920634920632, + "loss": 0.6349, + "step": 1040 + }, + { + "epoch": 1.1434848276809007, + "grad_norm": 0.38351112604141235, + "learning_rate": 0.0002062271062271062, + "loss": 0.4726, + "step": 1041 + }, + { + "epoch": 1.144583276122477, + "grad_norm": 0.5533854365348816, + "learning_rate": 0.0002061050061050061, + "loss": 0.5108, + "step": 1042 + }, + { + "epoch": 1.1456817245640534, + "grad_norm": 0.4842824637889862, + "learning_rate": 0.00020598290598290595, + "loss": 0.6038, + "step": 1043 + }, + { + "epoch": 1.1467801730056295, + "grad_norm": 0.552798330783844, + "learning_rate": 0.00020586080586080583, + "loss": 0.8056, + "step": 1044 + }, + { + "epoch": 1.1478786214472059, + "grad_norm": 0.40466025471687317, + "learning_rate": 0.00020573870573870574, + "loss": 0.6234, + "step": 1045 + }, + { + "epoch": 1.148977069888782, + "grad_norm": 0.6988784074783325, + "learning_rate": 0.0002056166056166056, + "loss": 0.7721, + "step": 1046 + }, + { + "epoch": 1.1500755183303584, + "grad_norm": 0.4852863550186157, + "learning_rate": 0.0002054945054945055, + "loss": 0.6074, + "step": 1047 + }, + { + "epoch": 1.1511739667719347, + "grad_norm": 0.4548696279525757, + "learning_rate": 0.00020537240537240537, + "loss": 0.5592, + "step": 1048 + }, + { + "epoch": 1.1522724152135109, + "grad_norm": 0.9355410933494568, + "learning_rate": 0.00020525030525030523, + "loss": 0.8618, + "step": 1049 + }, + { + "epoch": 1.1533708636550872, + "grad_norm": 0.5641398429870605, + "learning_rate": 0.00020512820512820512, + "loss": 0.704, + "step": 1050 + }, + { + "epoch": 1.1544693120966634, + "grad_norm": 0.48187771439552307, + "learning_rate": 0.000205006105006105, + "loss": 0.6008, + "step": 1051 + }, + { + "epoch": 1.1555677605382397, + "grad_norm": 0.41609904170036316, + "learning_rate": 0.00020488400488400486, + "loss": 0.8812, + "step": 1052 + }, + { + "epoch": 1.156666208979816, + "grad_norm": 0.919477105140686, + "learning_rate": 0.00020476190476190475, + "loss": 0.6597, + "step": 1053 + }, + { + "epoch": 1.1577646574213922, + "grad_norm": 0.5008611083030701, + "learning_rate": 0.0002046398046398046, + "loss": 0.6501, + "step": 1054 + }, + { + "epoch": 1.1588631058629686, + "grad_norm": 0.39832696318626404, + "learning_rate": 0.0002045177045177045, + "loss": 0.6232, + "step": 1055 + }, + { + "epoch": 1.159961554304545, + "grad_norm": 0.5290446281433105, + "learning_rate": 0.00020439560439560438, + "loss": 0.6123, + "step": 1056 + }, + { + "epoch": 1.161060002746121, + "grad_norm": 0.40837669372558594, + "learning_rate": 0.00020427350427350423, + "loss": 0.4989, + "step": 1057 + }, + { + "epoch": 1.1621584511876974, + "grad_norm": 0.43407055735588074, + "learning_rate": 0.00020415140415140412, + "loss": 0.6961, + "step": 1058 + }, + { + "epoch": 1.1632568996292736, + "grad_norm": 0.7601787447929382, + "learning_rate": 0.00020402930402930403, + "loss": 0.9308, + "step": 1059 + }, + { + "epoch": 1.16435534807085, + "grad_norm": 0.452628493309021, + "learning_rate": 0.00020390720390720386, + "loss": 0.6478, + "step": 1060 + }, + { + "epoch": 1.165453796512426, + "grad_norm": 0.4524000287055969, + "learning_rate": 0.00020378510378510378, + "loss": 0.4499, + "step": 1061 + }, + { + "epoch": 1.1665522449540024, + "grad_norm": 0.5971822142601013, + "learning_rate": 0.00020366300366300366, + "loss": 0.6402, + "step": 1062 + }, + { + "epoch": 1.1676506933955788, + "grad_norm": 0.36858659982681274, + "learning_rate": 0.00020354090354090352, + "loss": 0.6511, + "step": 1063 + }, + { + "epoch": 1.168749141837155, + "grad_norm": 0.47295433282852173, + "learning_rate": 0.0002034188034188034, + "loss": 0.5977, + "step": 1064 + }, + { + "epoch": 1.1698475902787313, + "grad_norm": 0.4402971565723419, + "learning_rate": 0.0002032967032967033, + "loss": 0.4824, + "step": 1065 + }, + { + "epoch": 1.1709460387203077, + "grad_norm": 0.3752620816230774, + "learning_rate": 0.00020317460317460315, + "loss": 0.6519, + "step": 1066 + }, + { + "epoch": 1.1720444871618838, + "grad_norm": 0.45207279920578003, + "learning_rate": 0.00020305250305250303, + "loss": 0.6869, + "step": 1067 + }, + { + "epoch": 1.1731429356034602, + "grad_norm": 0.4255804121494293, + "learning_rate": 0.00020293040293040292, + "loss": 0.7289, + "step": 1068 + }, + { + "epoch": 1.1742413840450363, + "grad_norm": 0.48725178837776184, + "learning_rate": 0.00020280830280830278, + "loss": 0.5472, + "step": 1069 + }, + { + "epoch": 1.1753398324866127, + "grad_norm": 0.37094470858573914, + "learning_rate": 0.00020268620268620266, + "loss": 0.558, + "step": 1070 + }, + { + "epoch": 1.176438280928189, + "grad_norm": 0.4191375970840454, + "learning_rate": 0.00020256410256410255, + "loss": 0.6422, + "step": 1071 + }, + { + "epoch": 1.1775367293697652, + "grad_norm": 0.4091531038284302, + "learning_rate": 0.0002024420024420024, + "loss": 0.6705, + "step": 1072 + }, + { + "epoch": 1.1786351778113415, + "grad_norm": 0.4876718521118164, + "learning_rate": 0.0002023199023199023, + "loss": 0.8265, + "step": 1073 + }, + { + "epoch": 1.1797336262529177, + "grad_norm": 0.43008798360824585, + "learning_rate": 0.0002021978021978022, + "loss": 0.5159, + "step": 1074 + }, + { + "epoch": 1.180832074694494, + "grad_norm": 0.47896140813827515, + "learning_rate": 0.00020207570207570204, + "loss": 0.5455, + "step": 1075 + }, + { + "epoch": 1.1819305231360704, + "grad_norm": 0.5313389301300049, + "learning_rate": 0.00020195360195360195, + "loss": 0.7628, + "step": 1076 + }, + { + "epoch": 1.1830289715776465, + "grad_norm": 0.46337512135505676, + "learning_rate": 0.00020183150183150184, + "loss": 0.6661, + "step": 1077 + }, + { + "epoch": 1.1841274200192229, + "grad_norm": 0.4304458498954773, + "learning_rate": 0.0002017094017094017, + "loss": 0.7019, + "step": 1078 + }, + { + "epoch": 1.185225868460799, + "grad_norm": 0.638445258140564, + "learning_rate": 0.00020158730158730158, + "loss": 0.6972, + "step": 1079 + }, + { + "epoch": 1.1863243169023754, + "grad_norm": 1.8217968940734863, + "learning_rate": 0.00020146520146520144, + "loss": 0.5217, + "step": 1080 + }, + { + "epoch": 1.1874227653439517, + "grad_norm": 0.4996611773967743, + "learning_rate": 0.00020134310134310132, + "loss": 0.6767, + "step": 1081 + }, + { + "epoch": 1.1885212137855279, + "grad_norm": 0.43705832958221436, + "learning_rate": 0.0002012210012210012, + "loss": 0.7364, + "step": 1082 + }, + { + "epoch": 1.1896196622271042, + "grad_norm": 0.4148736596107483, + "learning_rate": 0.00020109890109890107, + "loss": 0.7544, + "step": 1083 + }, + { + "epoch": 1.1907181106686804, + "grad_norm": 0.5772218108177185, + "learning_rate": 0.00020097680097680095, + "loss": 0.6349, + "step": 1084 + }, + { + "epoch": 1.1918165591102567, + "grad_norm": 0.9127015471458435, + "learning_rate": 0.00020085470085470084, + "loss": 0.4772, + "step": 1085 + }, + { + "epoch": 1.192915007551833, + "grad_norm": 0.46906840801239014, + "learning_rate": 0.0002007326007326007, + "loss": 0.6184, + "step": 1086 + }, + { + "epoch": 1.1940134559934092, + "grad_norm": 0.38405168056488037, + "learning_rate": 0.00020061050061050058, + "loss": 0.5027, + "step": 1087 + }, + { + "epoch": 1.1951119044349856, + "grad_norm": 0.6352836489677429, + "learning_rate": 0.00020048840048840047, + "loss": 0.6674, + "step": 1088 + }, + { + "epoch": 1.196210352876562, + "grad_norm": 0.6750807762145996, + "learning_rate": 0.00020036630036630033, + "loss": 0.5707, + "step": 1089 + }, + { + "epoch": 1.197308801318138, + "grad_norm": 0.5661985874176025, + "learning_rate": 0.00020024420024420024, + "loss": 0.8298, + "step": 1090 + }, + { + "epoch": 1.1984072497597145, + "grad_norm": 0.6393309831619263, + "learning_rate": 0.00020012210012210012, + "loss": 0.7397, + "step": 1091 + }, + { + "epoch": 1.1995056982012906, + "grad_norm": 0.5442856550216675, + "learning_rate": 0.00019999999999999998, + "loss": 0.7176, + "step": 1092 + }, + { + "epoch": 1.200604146642867, + "grad_norm": 1.0100654363632202, + "learning_rate": 0.00019987789987789987, + "loss": 0.8052, + "step": 1093 + }, + { + "epoch": 1.201702595084443, + "grad_norm": 0.3916209936141968, + "learning_rate": 0.00019975579975579975, + "loss": 0.5951, + "step": 1094 + }, + { + "epoch": 1.2028010435260195, + "grad_norm": 0.3890608847141266, + "learning_rate": 0.0001996336996336996, + "loss": 0.8129, + "step": 1095 + }, + { + "epoch": 1.2038994919675958, + "grad_norm": 0.4267507493495941, + "learning_rate": 0.0001995115995115995, + "loss": 0.8741, + "step": 1096 + }, + { + "epoch": 1.204997940409172, + "grad_norm": 0.49055561423301697, + "learning_rate": 0.00019938949938949938, + "loss": 0.901, + "step": 1097 + }, + { + "epoch": 1.2060963888507483, + "grad_norm": 0.6662428379058838, + "learning_rate": 0.00019926739926739924, + "loss": 0.4971, + "step": 1098 + }, + { + "epoch": 1.2071948372923247, + "grad_norm": 0.4469052255153656, + "learning_rate": 0.00019914529914529913, + "loss": 0.6593, + "step": 1099 + }, + { + "epoch": 1.2082932857339008, + "grad_norm": 0.5514255166053772, + "learning_rate": 0.000199023199023199, + "loss": 0.8033, + "step": 1100 + }, + { + "epoch": 1.2093917341754772, + "grad_norm": 0.4838184714317322, + "learning_rate": 0.00019890109890109887, + "loss": 0.5533, + "step": 1101 + }, + { + "epoch": 1.2104901826170533, + "grad_norm": 0.6061891913414001, + "learning_rate": 0.00019877899877899876, + "loss": 0.5837, + "step": 1102 + }, + { + "epoch": 1.2115886310586297, + "grad_norm": 0.3387523889541626, + "learning_rate": 0.00019865689865689867, + "loss": 0.455, + "step": 1103 + }, + { + "epoch": 1.212687079500206, + "grad_norm": 0.5204731225967407, + "learning_rate": 0.0001985347985347985, + "loss": 0.6869, + "step": 1104 + }, + { + "epoch": 1.2137855279417822, + "grad_norm": 0.5747571587562561, + "learning_rate": 0.0001984126984126984, + "loss": 0.7208, + "step": 1105 + }, + { + "epoch": 1.2148839763833585, + "grad_norm": 0.5382461547851562, + "learning_rate": 0.00019829059829059824, + "loss": 0.6035, + "step": 1106 + }, + { + "epoch": 1.2159824248249347, + "grad_norm": 0.44335421919822693, + "learning_rate": 0.00019816849816849816, + "loss": 0.8563, + "step": 1107 + }, + { + "epoch": 1.217080873266511, + "grad_norm": 0.3059934675693512, + "learning_rate": 0.00019804639804639804, + "loss": 0.6422, + "step": 1108 + }, + { + "epoch": 1.2181793217080874, + "grad_norm": 0.4306177794933319, + "learning_rate": 0.0001979242979242979, + "loss": 0.5347, + "step": 1109 + }, + { + "epoch": 1.2192777701496635, + "grad_norm": 0.5196095705032349, + "learning_rate": 0.00019780219780219779, + "loss": 0.5996, + "step": 1110 + }, + { + "epoch": 1.22037621859124, + "grad_norm": 0.4814283549785614, + "learning_rate": 0.00019768009768009767, + "loss": 0.6782, + "step": 1111 + }, + { + "epoch": 1.2214746670328163, + "grad_norm": 0.2287791222333908, + "learning_rate": 0.00019755799755799753, + "loss": 0.5908, + "step": 1112 + }, + { + "epoch": 1.2225731154743924, + "grad_norm": 0.43044313788414, + "learning_rate": 0.00019743589743589742, + "loss": 0.6554, + "step": 1113 + }, + { + "epoch": 1.2236715639159688, + "grad_norm": 0.390874445438385, + "learning_rate": 0.0001973137973137973, + "loss": 0.5777, + "step": 1114 + }, + { + "epoch": 1.224770012357545, + "grad_norm": 0.5380458235740662, + "learning_rate": 0.00019719169719169716, + "loss": 0.467, + "step": 1115 + }, + { + "epoch": 1.2258684607991213, + "grad_norm": 0.6176440119743347, + "learning_rate": 0.00019706959706959704, + "loss": 0.5625, + "step": 1116 + }, + { + "epoch": 1.2269669092406974, + "grad_norm": 0.4321332275867462, + "learning_rate": 0.00019694749694749693, + "loss": 0.7262, + "step": 1117 + }, + { + "epoch": 1.2280653576822738, + "grad_norm": 0.5679623484611511, + "learning_rate": 0.0001968253968253968, + "loss": 0.8216, + "step": 1118 + }, + { + "epoch": 1.2291638061238501, + "grad_norm": 0.4741218686103821, + "learning_rate": 0.00019670329670329667, + "loss": 0.7164, + "step": 1119 + }, + { + "epoch": 1.2302622545654263, + "grad_norm": 0.6570267677307129, + "learning_rate": 0.00019658119658119659, + "loss": 0.7606, + "step": 1120 + }, + { + "epoch": 1.2313607030070026, + "grad_norm": 0.4256306290626526, + "learning_rate": 0.00019645909645909644, + "loss": 0.5137, + "step": 1121 + }, + { + "epoch": 1.232459151448579, + "grad_norm": 0.4444984793663025, + "learning_rate": 0.00019633699633699633, + "loss": 0.8863, + "step": 1122 + }, + { + "epoch": 1.2335575998901551, + "grad_norm": 0.458133339881897, + "learning_rate": 0.00019621489621489622, + "loss": 0.6445, + "step": 1123 + }, + { + "epoch": 1.2346560483317315, + "grad_norm": 0.6087627410888672, + "learning_rate": 0.00019609279609279607, + "loss": 0.5625, + "step": 1124 + }, + { + "epoch": 1.2357544967733076, + "grad_norm": 0.42782312631607056, + "learning_rate": 0.00019597069597069596, + "loss": 0.6321, + "step": 1125 + }, + { + "epoch": 1.236852945214884, + "grad_norm": 0.49623987078666687, + "learning_rate": 0.00019584859584859585, + "loss": 0.6473, + "step": 1126 + }, + { + "epoch": 1.2379513936564603, + "grad_norm": 0.5348198413848877, + "learning_rate": 0.0001957264957264957, + "loss": 0.6948, + "step": 1127 + }, + { + "epoch": 1.2390498420980365, + "grad_norm": 0.44476062059402466, + "learning_rate": 0.0001956043956043956, + "loss": 0.5917, + "step": 1128 + }, + { + "epoch": 1.2401482905396128, + "grad_norm": 0.5777286291122437, + "learning_rate": 0.00019548229548229547, + "loss": 0.7474, + "step": 1129 + }, + { + "epoch": 1.241246738981189, + "grad_norm": 0.3132689893245697, + "learning_rate": 0.00019536019536019533, + "loss": 0.5827, + "step": 1130 + }, + { + "epoch": 1.2423451874227653, + "grad_norm": 0.3898192346096039, + "learning_rate": 0.00019523809523809522, + "loss": 0.5469, + "step": 1131 + }, + { + "epoch": 1.2434436358643417, + "grad_norm": 0.338693767786026, + "learning_rate": 0.00019511599511599508, + "loss": 0.704, + "step": 1132 + }, + { + "epoch": 1.2445420843059178, + "grad_norm": 0.4276609718799591, + "learning_rate": 0.00019499389499389496, + "loss": 0.7269, + "step": 1133 + }, + { + "epoch": 1.2456405327474942, + "grad_norm": 0.7320281863212585, + "learning_rate": 0.00019487179487179487, + "loss": 0.62, + "step": 1134 + }, + { + "epoch": 1.2467389811890706, + "grad_norm": 0.4023820757865906, + "learning_rate": 0.0001947496947496947, + "loss": 0.4234, + "step": 1135 + }, + { + "epoch": 1.2478374296306467, + "grad_norm": 0.3218212425708771, + "learning_rate": 0.00019462759462759462, + "loss": 0.5325, + "step": 1136 + }, + { + "epoch": 1.248935878072223, + "grad_norm": 0.45131513476371765, + "learning_rate": 0.0001945054945054945, + "loss": 0.5667, + "step": 1137 + }, + { + "epoch": 1.2500343265137992, + "grad_norm": 0.604475200176239, + "learning_rate": 0.00019438339438339436, + "loss": 0.9018, + "step": 1138 + }, + { + "epoch": 1.2511327749553756, + "grad_norm": 0.46968311071395874, + "learning_rate": 0.00019426129426129425, + "loss": 0.7946, + "step": 1139 + }, + { + "epoch": 1.2522312233969517, + "grad_norm": 0.3960346281528473, + "learning_rate": 0.00019413919413919413, + "loss": 0.7719, + "step": 1140 + }, + { + "epoch": 1.253329671838528, + "grad_norm": 0.5146461129188538, + "learning_rate": 0.000194017094017094, + "loss": 0.8946, + "step": 1141 + }, + { + "epoch": 1.2544281202801044, + "grad_norm": 0.6343802809715271, + "learning_rate": 0.00019389499389499388, + "loss": 0.7822, + "step": 1142 + }, + { + "epoch": 1.2555265687216806, + "grad_norm": 0.4646434485912323, + "learning_rate": 0.00019377289377289376, + "loss": 0.6722, + "step": 1143 + }, + { + "epoch": 1.256625017163257, + "grad_norm": 0.48127877712249756, + "learning_rate": 0.00019365079365079362, + "loss": 0.9059, + "step": 1144 + }, + { + "epoch": 1.2577234656048333, + "grad_norm": 0.4040716290473938, + "learning_rate": 0.0001935286935286935, + "loss": 0.7288, + "step": 1145 + }, + { + "epoch": 1.2588219140464094, + "grad_norm": 0.43992865085601807, + "learning_rate": 0.0001934065934065934, + "loss": 0.5804, + "step": 1146 + }, + { + "epoch": 1.2599203624879858, + "grad_norm": 0.41578513383865356, + "learning_rate": 0.00019328449328449325, + "loss": 0.5459, + "step": 1147 + }, + { + "epoch": 1.261018810929562, + "grad_norm": 0.40165719389915466, + "learning_rate": 0.00019316239316239314, + "loss": 0.6001, + "step": 1148 + }, + { + "epoch": 1.2621172593711383, + "grad_norm": 0.43200212717056274, + "learning_rate": 0.00019304029304029305, + "loss": 0.8712, + "step": 1149 + }, + { + "epoch": 1.2632157078127144, + "grad_norm": 0.3217264413833618, + "learning_rate": 0.00019291819291819288, + "loss": 0.6074, + "step": 1150 + }, + { + "epoch": 1.2643141562542908, + "grad_norm": 0.3964528441429138, + "learning_rate": 0.0001927960927960928, + "loss": 0.6131, + "step": 1151 + }, + { + "epoch": 1.2654126046958671, + "grad_norm": 0.5151070952415466, + "learning_rate": 0.00019267399267399268, + "loss": 0.6992, + "step": 1152 + }, + { + "epoch": 1.2665110531374433, + "grad_norm": 0.5902129411697388, + "learning_rate": 0.00019255189255189254, + "loss": 0.7311, + "step": 1153 + }, + { + "epoch": 1.2676095015790196, + "grad_norm": 0.5386108160018921, + "learning_rate": 0.00019242979242979242, + "loss": 0.6469, + "step": 1154 + }, + { + "epoch": 1.268707950020596, + "grad_norm": 0.384093701839447, + "learning_rate": 0.0001923076923076923, + "loss": 0.7111, + "step": 1155 + }, + { + "epoch": 1.2698063984621721, + "grad_norm": 0.34160250425338745, + "learning_rate": 0.00019218559218559217, + "loss": 0.5396, + "step": 1156 + }, + { + "epoch": 1.2709048469037485, + "grad_norm": 0.6590912938117981, + "learning_rate": 0.00019206349206349205, + "loss": 1.1613, + "step": 1157 + }, + { + "epoch": 1.2720032953453249, + "grad_norm": 0.6230842471122742, + "learning_rate": 0.0001919413919413919, + "loss": 0.7701, + "step": 1158 + }, + { + "epoch": 1.273101743786901, + "grad_norm": 0.3881864547729492, + "learning_rate": 0.0001918192918192918, + "loss": 0.633, + "step": 1159 + }, + { + "epoch": 1.2742001922284774, + "grad_norm": 0.4538264274597168, + "learning_rate": 0.00019169719169719168, + "loss": 0.451, + "step": 1160 + }, + { + "epoch": 1.2752986406700535, + "grad_norm": 0.6188018321990967, + "learning_rate": 0.00019157509157509154, + "loss": 0.9563, + "step": 1161 + }, + { + "epoch": 1.2763970891116299, + "grad_norm": 0.4172852039337158, + "learning_rate": 0.00019145299145299142, + "loss": 0.8284, + "step": 1162 + }, + { + "epoch": 1.277495537553206, + "grad_norm": 0.338623583316803, + "learning_rate": 0.0001913308913308913, + "loss": 0.6745, + "step": 1163 + }, + { + "epoch": 1.2785939859947824, + "grad_norm": 0.3960900902748108, + "learning_rate": 0.00019120879120879117, + "loss": 0.6508, + "step": 1164 + }, + { + "epoch": 1.2796924344363587, + "grad_norm": 0.37232962250709534, + "learning_rate": 0.00019108669108669108, + "loss": 0.7347, + "step": 1165 + }, + { + "epoch": 1.2807908828779349, + "grad_norm": 0.47092223167419434, + "learning_rate": 0.00019096459096459097, + "loss": 0.8251, + "step": 1166 + }, + { + "epoch": 1.2818893313195112, + "grad_norm": 0.4647108316421509, + "learning_rate": 0.00019084249084249082, + "loss": 0.556, + "step": 1167 + }, + { + "epoch": 1.2829877797610876, + "grad_norm": 0.5812810659408569, + "learning_rate": 0.0001907203907203907, + "loss": 0.6802, + "step": 1168 + }, + { + "epoch": 1.2840862282026637, + "grad_norm": 0.3731052279472351, + "learning_rate": 0.0001905982905982906, + "loss": 0.6384, + "step": 1169 + }, + { + "epoch": 1.28518467664424, + "grad_norm": 0.47995856404304504, + "learning_rate": 0.00019047619047619045, + "loss": 0.4914, + "step": 1170 + }, + { + "epoch": 1.2862831250858162, + "grad_norm": 0.3223705589771271, + "learning_rate": 0.00019035409035409034, + "loss": 0.6676, + "step": 1171 + }, + { + "epoch": 1.2873815735273926, + "grad_norm": 0.5643377304077148, + "learning_rate": 0.00019023199023199023, + "loss": 0.8224, + "step": 1172 + }, + { + "epoch": 1.2884800219689687, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.00019010989010989008, + "loss": 0.8005, + "step": 1173 + }, + { + "epoch": 1.289578470410545, + "grad_norm": 0.40516728162765503, + "learning_rate": 0.00018998778998778997, + "loss": 0.5463, + "step": 1174 + }, + { + "epoch": 1.2906769188521214, + "grad_norm": 0.45521625876426697, + "learning_rate": 0.00018986568986568985, + "loss": 0.7562, + "step": 1175 + }, + { + "epoch": 1.2917753672936976, + "grad_norm": 0.38747909665107727, + "learning_rate": 0.0001897435897435897, + "loss": 0.5074, + "step": 1176 + }, + { + "epoch": 1.292873815735274, + "grad_norm": 0.39688000082969666, + "learning_rate": 0.0001896214896214896, + "loss": 0.3551, + "step": 1177 + }, + { + "epoch": 1.2939722641768503, + "grad_norm": 0.6891604065895081, + "learning_rate": 0.0001894993894993895, + "loss": 0.601, + "step": 1178 + }, + { + "epoch": 1.2950707126184264, + "grad_norm": 0.5177300572395325, + "learning_rate": 0.00018937728937728934, + "loss": 0.5188, + "step": 1179 + }, + { + "epoch": 1.2961691610600028, + "grad_norm": 0.3166979253292084, + "learning_rate": 0.00018925518925518926, + "loss": 0.8411, + "step": 1180 + }, + { + "epoch": 1.2972676095015792, + "grad_norm": 0.6637437343597412, + "learning_rate": 0.00018913308913308914, + "loss": 0.7256, + "step": 1181 + }, + { + "epoch": 1.2983660579431553, + "grad_norm": 0.424932599067688, + "learning_rate": 0.000189010989010989, + "loss": 0.783, + "step": 1182 + }, + { + "epoch": 1.2994645063847314, + "grad_norm": 0.47751033306121826, + "learning_rate": 0.00018888888888888888, + "loss": 0.7039, + "step": 1183 + }, + { + "epoch": 1.3005629548263078, + "grad_norm": 0.4332704544067383, + "learning_rate": 0.00018876678876678874, + "loss": 0.4797, + "step": 1184 + }, + { + "epoch": 1.3016614032678842, + "grad_norm": 0.439431756734848, + "learning_rate": 0.00018864468864468863, + "loss": 0.6256, + "step": 1185 + }, + { + "epoch": 1.3027598517094603, + "grad_norm": 0.4334176480770111, + "learning_rate": 0.00018852258852258851, + "loss": 0.5583, + "step": 1186 + }, + { + "epoch": 1.3038583001510367, + "grad_norm": 0.42080724239349365, + "learning_rate": 0.00018840048840048837, + "loss": 0.461, + "step": 1187 + }, + { + "epoch": 1.304956748592613, + "grad_norm": 0.41007399559020996, + "learning_rate": 0.00018827838827838826, + "loss": 0.4746, + "step": 1188 + }, + { + "epoch": 1.3060551970341892, + "grad_norm": 0.3763822019100189, + "learning_rate": 0.00018815628815628814, + "loss": 0.5352, + "step": 1189 + }, + { + "epoch": 1.3071536454757655, + "grad_norm": 0.5557730197906494, + "learning_rate": 0.000188034188034188, + "loss": 0.5404, + "step": 1190 + }, + { + "epoch": 1.3082520939173419, + "grad_norm": 0.43677788972854614, + "learning_rate": 0.0001879120879120879, + "loss": 0.7111, + "step": 1191 + }, + { + "epoch": 1.309350542358918, + "grad_norm": 0.6084219217300415, + "learning_rate": 0.00018778998778998777, + "loss": 0.7524, + "step": 1192 + }, + { + "epoch": 1.3104489908004944, + "grad_norm": 0.7219144701957703, + "learning_rate": 0.00018766788766788763, + "loss": 0.6182, + "step": 1193 + }, + { + "epoch": 1.3115474392420705, + "grad_norm": 0.5280331969261169, + "learning_rate": 0.00018754578754578752, + "loss": 0.8023, + "step": 1194 + }, + { + "epoch": 1.3126458876836469, + "grad_norm": 0.42130032181739807, + "learning_rate": 0.00018742368742368743, + "loss": 0.5673, + "step": 1195 + }, + { + "epoch": 1.313744336125223, + "grad_norm": 0.6063292026519775, + "learning_rate": 0.0001873015873015873, + "loss": 0.6438, + "step": 1196 + }, + { + "epoch": 1.3148427845667994, + "grad_norm": 0.4073690176010132, + "learning_rate": 0.00018717948717948717, + "loss": 0.7099, + "step": 1197 + }, + { + "epoch": 1.3159412330083757, + "grad_norm": 0.5419113636016846, + "learning_rate": 0.00018705738705738706, + "loss": 0.6451, + "step": 1198 + }, + { + "epoch": 1.3170396814499519, + "grad_norm": 0.4489867091178894, + "learning_rate": 0.00018693528693528692, + "loss": 0.7522, + "step": 1199 + }, + { + "epoch": 1.3181381298915282, + "grad_norm": 0.3536837697029114, + "learning_rate": 0.0001868131868131868, + "loss": 0.6201, + "step": 1200 + }, + { + "epoch": 1.3192365783331046, + "grad_norm": 0.42462313175201416, + "learning_rate": 0.0001866910866910867, + "loss": 0.4804, + "step": 1201 + }, + { + "epoch": 1.3203350267746807, + "grad_norm": 0.612319827079773, + "learning_rate": 0.00018656898656898655, + "loss": 0.8546, + "step": 1202 + }, + { + "epoch": 1.321433475216257, + "grad_norm": 0.5242000222206116, + "learning_rate": 0.00018644688644688643, + "loss": 0.7577, + "step": 1203 + }, + { + "epoch": 1.3225319236578332, + "grad_norm": 0.5688628554344177, + "learning_rate": 0.00018632478632478632, + "loss": 0.6645, + "step": 1204 + }, + { + "epoch": 1.3236303720994096, + "grad_norm": 0.3695731461048126, + "learning_rate": 0.00018620268620268618, + "loss": 0.4979, + "step": 1205 + }, + { + "epoch": 1.3247288205409857, + "grad_norm": 0.44525593519210815, + "learning_rate": 0.00018608058608058606, + "loss": 0.807, + "step": 1206 + }, + { + "epoch": 1.325827268982562, + "grad_norm": 0.37627971172332764, + "learning_rate": 0.00018595848595848595, + "loss": 0.6584, + "step": 1207 + }, + { + "epoch": 1.3269257174241385, + "grad_norm": 0.39727315306663513, + "learning_rate": 0.0001858363858363858, + "loss": 0.5565, + "step": 1208 + }, + { + "epoch": 1.3280241658657146, + "grad_norm": 0.4151424169540405, + "learning_rate": 0.00018571428571428572, + "loss": 0.81, + "step": 1209 + }, + { + "epoch": 1.329122614307291, + "grad_norm": 0.37529075145721436, + "learning_rate": 0.00018559218559218555, + "loss": 0.6188, + "step": 1210 + }, + { + "epoch": 1.3302210627488673, + "grad_norm": 0.43061408400535583, + "learning_rate": 0.00018547008547008546, + "loss": 0.814, + "step": 1211 + }, + { + "epoch": 1.3313195111904434, + "grad_norm": 0.437511682510376, + "learning_rate": 0.00018534798534798535, + "loss": 0.55, + "step": 1212 + }, + { + "epoch": 1.3324179596320198, + "grad_norm": 0.5172685980796814, + "learning_rate": 0.0001852258852258852, + "loss": 0.6551, + "step": 1213 + }, + { + "epoch": 1.3335164080735962, + "grad_norm": 0.3292716443538666, + "learning_rate": 0.0001851037851037851, + "loss": 0.5108, + "step": 1214 + }, + { + "epoch": 1.3346148565151723, + "grad_norm": 0.7129474878311157, + "learning_rate": 0.00018498168498168498, + "loss": 0.7197, + "step": 1215 + }, + { + "epoch": 1.3357133049567487, + "grad_norm": 0.46317145228385925, + "learning_rate": 0.00018485958485958483, + "loss": 0.6553, + "step": 1216 + }, + { + "epoch": 1.3368117533983248, + "grad_norm": 0.5539398789405823, + "learning_rate": 0.00018473748473748472, + "loss": 0.7057, + "step": 1217 + }, + { + "epoch": 1.3379102018399012, + "grad_norm": 0.40555253624916077, + "learning_rate": 0.0001846153846153846, + "loss": 0.5976, + "step": 1218 + }, + { + "epoch": 1.3390086502814773, + "grad_norm": 0.462704062461853, + "learning_rate": 0.00018449328449328446, + "loss": 0.7018, + "step": 1219 + }, + { + "epoch": 1.3401070987230537, + "grad_norm": 0.407287061214447, + "learning_rate": 0.00018437118437118435, + "loss": 0.4726, + "step": 1220 + }, + { + "epoch": 1.34120554716463, + "grad_norm": 0.3654995858669281, + "learning_rate": 0.00018424908424908423, + "loss": 0.5811, + "step": 1221 + }, + { + "epoch": 1.3423039956062062, + "grad_norm": 0.46455878019332886, + "learning_rate": 0.0001841269841269841, + "loss": 0.8998, + "step": 1222 + }, + { + "epoch": 1.3434024440477825, + "grad_norm": 0.47929346561431885, + "learning_rate": 0.00018400488400488398, + "loss": 0.7348, + "step": 1223 + }, + { + "epoch": 1.344500892489359, + "grad_norm": 0.7128652930259705, + "learning_rate": 0.0001838827838827839, + "loss": 1.2647, + "step": 1224 + }, + { + "epoch": 1.345599340930935, + "grad_norm": 0.3956572413444519, + "learning_rate": 0.00018376068376068372, + "loss": 0.6985, + "step": 1225 + }, + { + "epoch": 1.3466977893725114, + "grad_norm": 0.5585309863090515, + "learning_rate": 0.00018363858363858364, + "loss": 1.0086, + "step": 1226 + }, + { + "epoch": 1.3477962378140875, + "grad_norm": 1.5960838794708252, + "learning_rate": 0.00018351648351648352, + "loss": 0.644, + "step": 1227 + }, + { + "epoch": 1.3488946862556639, + "grad_norm": 0.6499342322349548, + "learning_rate": 0.00018339438339438338, + "loss": 0.7698, + "step": 1228 + }, + { + "epoch": 1.34999313469724, + "grad_norm": 0.42246925830841064, + "learning_rate": 0.00018327228327228326, + "loss": 0.5614, + "step": 1229 + }, + { + "epoch": 1.3510915831388164, + "grad_norm": 0.42192572355270386, + "learning_rate": 0.00018315018315018315, + "loss": 0.7726, + "step": 1230 + }, + { + "epoch": 1.3521900315803927, + "grad_norm": 0.6409221887588501, + "learning_rate": 0.000183028083028083, + "loss": 0.5928, + "step": 1231 + }, + { + "epoch": 1.3532884800219689, + "grad_norm": 1.328852653503418, + "learning_rate": 0.0001829059829059829, + "loss": 0.7861, + "step": 1232 + }, + { + "epoch": 1.3543869284635452, + "grad_norm": 0.4519331753253937, + "learning_rate": 0.00018278388278388275, + "loss": 0.5938, + "step": 1233 + }, + { + "epoch": 1.3554853769051216, + "grad_norm": 0.3942720592021942, + "learning_rate": 0.00018266178266178264, + "loss": 0.4781, + "step": 1234 + }, + { + "epoch": 1.3565838253466977, + "grad_norm": 0.5066869258880615, + "learning_rate": 0.00018253968253968252, + "loss": 0.8069, + "step": 1235 + }, + { + "epoch": 1.357682273788274, + "grad_norm": 0.37002792954444885, + "learning_rate": 0.00018241758241758238, + "loss": 0.5737, + "step": 1236 + }, + { + "epoch": 1.3587807222298505, + "grad_norm": 0.3738810122013092, + "learning_rate": 0.00018229548229548227, + "loss": 0.5169, + "step": 1237 + }, + { + "epoch": 1.3598791706714266, + "grad_norm": 0.44956260919570923, + "learning_rate": 0.00018217338217338215, + "loss": 0.5614, + "step": 1238 + }, + { + "epoch": 1.3609776191130027, + "grad_norm": 0.34839004278182983, + "learning_rate": 0.000182051282051282, + "loss": 0.5783, + "step": 1239 + }, + { + "epoch": 1.362076067554579, + "grad_norm": 0.30152127146720886, + "learning_rate": 0.00018192918192918192, + "loss": 0.4321, + "step": 1240 + }, + { + "epoch": 1.3631745159961555, + "grad_norm": 0.6672345399856567, + "learning_rate": 0.0001818070818070818, + "loss": 0.6073, + "step": 1241 + }, + { + "epoch": 1.3642729644377316, + "grad_norm": 0.45652687549591064, + "learning_rate": 0.00018168498168498167, + "loss": 0.6193, + "step": 1242 + }, + { + "epoch": 1.365371412879308, + "grad_norm": 0.6392306089401245, + "learning_rate": 0.00018156288156288155, + "loss": 0.8388, + "step": 1243 + }, + { + "epoch": 1.3664698613208843, + "grad_norm": 0.5510252714157104, + "learning_rate": 0.00018144078144078144, + "loss": 0.6512, + "step": 1244 + }, + { + "epoch": 1.3675683097624605, + "grad_norm": 0.38780227303504944, + "learning_rate": 0.0001813186813186813, + "loss": 0.6835, + "step": 1245 + }, + { + "epoch": 1.3686667582040368, + "grad_norm": 0.47472965717315674, + "learning_rate": 0.00018119658119658118, + "loss": 0.6625, + "step": 1246 + }, + { + "epoch": 1.3697652066456132, + "grad_norm": 0.3599228262901306, + "learning_rate": 0.00018107448107448107, + "loss": 0.5063, + "step": 1247 + }, + { + "epoch": 1.3708636550871893, + "grad_norm": 0.3284567892551422, + "learning_rate": 0.00018095238095238093, + "loss": 0.7679, + "step": 1248 + }, + { + "epoch": 1.3719621035287657, + "grad_norm": 0.5258575081825256, + "learning_rate": 0.0001808302808302808, + "loss": 0.6213, + "step": 1249 + }, + { + "epoch": 1.3730605519703418, + "grad_norm": 0.3211069405078888, + "learning_rate": 0.0001807081807081807, + "loss": 0.5306, + "step": 1250 + }, + { + "epoch": 1.3741590004119182, + "grad_norm": 0.6325588822364807, + "learning_rate": 0.00018058608058608056, + "loss": 0.8104, + "step": 1251 + }, + { + "epoch": 1.3752574488534943, + "grad_norm": 0.4994303584098816, + "learning_rate": 0.00018046398046398044, + "loss": 0.6464, + "step": 1252 + }, + { + "epoch": 1.3763558972950707, + "grad_norm": 0.3013019263744354, + "learning_rate": 0.00018034188034188035, + "loss": 0.4749, + "step": 1253 + }, + { + "epoch": 1.377454345736647, + "grad_norm": 1.0342131853103638, + "learning_rate": 0.00018021978021978018, + "loss": 0.7995, + "step": 1254 + }, + { + "epoch": 1.3785527941782232, + "grad_norm": 0.40213823318481445, + "learning_rate": 0.0001800976800976801, + "loss": 0.8791, + "step": 1255 + }, + { + "epoch": 1.3796512426197995, + "grad_norm": 0.37126532196998596, + "learning_rate": 0.00017997557997557998, + "loss": 0.551, + "step": 1256 + }, + { + "epoch": 1.380749691061376, + "grad_norm": 0.3417685031890869, + "learning_rate": 0.00017985347985347984, + "loss": 0.583, + "step": 1257 + }, + { + "epoch": 1.381848139502952, + "grad_norm": 0.33571329712867737, + "learning_rate": 0.00017973137973137973, + "loss": 0.4927, + "step": 1258 + }, + { + "epoch": 1.3829465879445284, + "grad_norm": 0.5128073692321777, + "learning_rate": 0.00017960927960927959, + "loss": 0.5903, + "step": 1259 + }, + { + "epoch": 1.3840450363861048, + "grad_norm": 0.5345245599746704, + "learning_rate": 0.00017948717948717947, + "loss": 0.5828, + "step": 1260 + }, + { + "epoch": 1.385143484827681, + "grad_norm": 0.312639981508255, + "learning_rate": 0.00017936507936507936, + "loss": 0.6905, + "step": 1261 + }, + { + "epoch": 1.386241933269257, + "grad_norm": 0.4795394837856293, + "learning_rate": 0.00017924297924297921, + "loss": 0.6193, + "step": 1262 + }, + { + "epoch": 1.3873403817108334, + "grad_norm": 0.39672231674194336, + "learning_rate": 0.0001791208791208791, + "loss": 0.7833, + "step": 1263 + }, + { + "epoch": 1.3884388301524098, + "grad_norm": 0.46752655506134033, + "learning_rate": 0.00017899877899877899, + "loss": 0.6385, + "step": 1264 + }, + { + "epoch": 1.389537278593986, + "grad_norm": 0.5376736521720886, + "learning_rate": 0.00017887667887667884, + "loss": 0.6362, + "step": 1265 + }, + { + "epoch": 1.3906357270355623, + "grad_norm": 0.5675904750823975, + "learning_rate": 0.00017875457875457873, + "loss": 0.7975, + "step": 1266 + }, + { + "epoch": 1.3917341754771386, + "grad_norm": 0.5429015755653381, + "learning_rate": 0.00017863247863247861, + "loss": 0.5415, + "step": 1267 + }, + { + "epoch": 1.3928326239187148, + "grad_norm": 0.3714626729488373, + "learning_rate": 0.00017851037851037847, + "loss": 0.7104, + "step": 1268 + }, + { + "epoch": 1.3939310723602911, + "grad_norm": 0.7549324035644531, + "learning_rate": 0.00017838827838827836, + "loss": 0.698, + "step": 1269 + }, + { + "epoch": 1.3950295208018675, + "grad_norm": 0.36867257952690125, + "learning_rate": 0.00017826617826617827, + "loss": 0.6019, + "step": 1270 + }, + { + "epoch": 1.3961279692434436, + "grad_norm": 0.42439624667167664, + "learning_rate": 0.00017814407814407813, + "loss": 0.4626, + "step": 1271 + }, + { + "epoch": 1.39722641768502, + "grad_norm": 0.4768877923488617, + "learning_rate": 0.00017802197802197802, + "loss": 0.671, + "step": 1272 + }, + { + "epoch": 1.3983248661265961, + "grad_norm": 0.3415908217430115, + "learning_rate": 0.0001778998778998779, + "loss": 0.5904, + "step": 1273 + }, + { + "epoch": 1.3994233145681725, + "grad_norm": 0.5370535850524902, + "learning_rate": 0.00017777777777777776, + "loss": 0.578, + "step": 1274 + }, + { + "epoch": 1.4005217630097486, + "grad_norm": 0.61114901304245, + "learning_rate": 0.00017765567765567764, + "loss": 0.6498, + "step": 1275 + }, + { + "epoch": 1.401620211451325, + "grad_norm": 0.3491772711277008, + "learning_rate": 0.00017753357753357753, + "loss": 0.6057, + "step": 1276 + }, + { + "epoch": 1.4027186598929013, + "grad_norm": 0.4992705285549164, + "learning_rate": 0.0001774114774114774, + "loss": 0.8541, + "step": 1277 + }, + { + "epoch": 1.4038171083344775, + "grad_norm": 0.5476379990577698, + "learning_rate": 0.00017728937728937727, + "loss": 0.5608, + "step": 1278 + }, + { + "epoch": 1.4049155567760538, + "grad_norm": 0.6107895374298096, + "learning_rate": 0.00017716727716727716, + "loss": 0.7437, + "step": 1279 + }, + { + "epoch": 1.4060140052176302, + "grad_norm": 0.510809600353241, + "learning_rate": 0.00017704517704517702, + "loss": 0.6569, + "step": 1280 + }, + { + "epoch": 1.4071124536592063, + "grad_norm": 0.5050077438354492, + "learning_rate": 0.0001769230769230769, + "loss": 0.6566, + "step": 1281 + }, + { + "epoch": 1.4082109021007827, + "grad_norm": 0.44812703132629395, + "learning_rate": 0.0001768009768009768, + "loss": 0.6557, + "step": 1282 + }, + { + "epoch": 1.4093093505423588, + "grad_norm": 0.5216537714004517, + "learning_rate": 0.00017667887667887665, + "loss": 0.7311, + "step": 1283 + }, + { + "epoch": 1.4104077989839352, + "grad_norm": 0.5608856081962585, + "learning_rate": 0.00017655677655677656, + "loss": 0.9001, + "step": 1284 + }, + { + "epoch": 1.4115062474255113, + "grad_norm": 0.47205066680908203, + "learning_rate": 0.0001764346764346764, + "loss": 0.5214, + "step": 1285 + }, + { + "epoch": 1.4126046958670877, + "grad_norm": 0.4073629081249237, + "learning_rate": 0.0001763125763125763, + "loss": 0.483, + "step": 1286 + }, + { + "epoch": 1.413703144308664, + "grad_norm": 0.42381593585014343, + "learning_rate": 0.0001761904761904762, + "loss": 0.4895, + "step": 1287 + }, + { + "epoch": 1.4148015927502402, + "grad_norm": 0.629356861114502, + "learning_rate": 0.00017606837606837605, + "loss": 0.4639, + "step": 1288 + }, + { + "epoch": 1.4159000411918166, + "grad_norm": 0.3123486340045929, + "learning_rate": 0.00017594627594627593, + "loss": 0.4575, + "step": 1289 + }, + { + "epoch": 1.416998489633393, + "grad_norm": 0.4163682460784912, + "learning_rate": 0.00017582417582417582, + "loss": 0.7511, + "step": 1290 + }, + { + "epoch": 1.418096938074969, + "grad_norm": 0.5697455406188965, + "learning_rate": 0.00017570207570207568, + "loss": 0.5977, + "step": 1291 + }, + { + "epoch": 1.4191953865165454, + "grad_norm": 0.39232510328292847, + "learning_rate": 0.00017557997557997556, + "loss": 0.6133, + "step": 1292 + }, + { + "epoch": 1.4202938349581218, + "grad_norm": 0.5452993512153625, + "learning_rate": 0.00017545787545787545, + "loss": 0.6596, + "step": 1293 + }, + { + "epoch": 1.421392283399698, + "grad_norm": 0.39080601930618286, + "learning_rate": 0.0001753357753357753, + "loss": 0.7422, + "step": 1294 + }, + { + "epoch": 1.4224907318412743, + "grad_norm": 0.6513398289680481, + "learning_rate": 0.0001752136752136752, + "loss": 0.5277, + "step": 1295 + }, + { + "epoch": 1.4235891802828504, + "grad_norm": 0.4627130329608917, + "learning_rate": 0.00017509157509157508, + "loss": 0.6296, + "step": 1296 + }, + { + "epoch": 1.4246876287244268, + "grad_norm": 0.499700129032135, + "learning_rate": 0.00017496947496947494, + "loss": 0.689, + "step": 1297 + }, + { + "epoch": 1.425786077166003, + "grad_norm": 0.4668709635734558, + "learning_rate": 0.00017484737484737482, + "loss": 0.784, + "step": 1298 + }, + { + "epoch": 1.4268845256075793, + "grad_norm": 0.6378145217895508, + "learning_rate": 0.00017472527472527473, + "loss": 0.5077, + "step": 1299 + }, + { + "epoch": 1.4279829740491556, + "grad_norm": 0.6320174336433411, + "learning_rate": 0.00017460317460317457, + "loss": 1.061, + "step": 1300 + }, + { + "epoch": 1.4290814224907318, + "grad_norm": 0.48719078302383423, + "learning_rate": 0.00017448107448107448, + "loss": 0.7181, + "step": 1301 + }, + { + "epoch": 1.4301798709323081, + "grad_norm": 0.5345287919044495, + "learning_rate": 0.00017435897435897436, + "loss": 0.5599, + "step": 1302 + }, + { + "epoch": 1.4312783193738845, + "grad_norm": 0.567857563495636, + "learning_rate": 0.00017423687423687422, + "loss": 0.6294, + "step": 1303 + }, + { + "epoch": 1.4323767678154606, + "grad_norm": 0.5715040564537048, + "learning_rate": 0.0001741147741147741, + "loss": 0.5326, + "step": 1304 + }, + { + "epoch": 1.433475216257037, + "grad_norm": 0.40048834681510925, + "learning_rate": 0.000173992673992674, + "loss": 0.687, + "step": 1305 + }, + { + "epoch": 1.4345736646986131, + "grad_norm": 0.4964540898799896, + "learning_rate": 0.00017387057387057385, + "loss": 0.6149, + "step": 1306 + }, + { + "epoch": 1.4356721131401895, + "grad_norm": 0.5018569231033325, + "learning_rate": 0.00017374847374847374, + "loss": 0.4224, + "step": 1307 + }, + { + "epoch": 1.4367705615817656, + "grad_norm": 0.6026094555854797, + "learning_rate": 0.00017362637362637362, + "loss": 0.8934, + "step": 1308 + }, + { + "epoch": 1.437869010023342, + "grad_norm": 0.33409950137138367, + "learning_rate": 0.00017350427350427348, + "loss": 0.6725, + "step": 1309 + }, + { + "epoch": 1.4389674584649184, + "grad_norm": 0.43982234597206116, + "learning_rate": 0.00017338217338217337, + "loss": 0.9203, + "step": 1310 + }, + { + "epoch": 1.4400659069064945, + "grad_norm": 0.843877911567688, + "learning_rate": 0.00017326007326007322, + "loss": 0.6028, + "step": 1311 + }, + { + "epoch": 1.4411643553480709, + "grad_norm": 0.35148733854293823, + "learning_rate": 0.0001731379731379731, + "loss": 0.7503, + "step": 1312 + }, + { + "epoch": 1.4422628037896472, + "grad_norm": 0.4561845362186432, + "learning_rate": 0.000173015873015873, + "loss": 0.6577, + "step": 1313 + }, + { + "epoch": 1.4433612522312234, + "grad_norm": 0.47295713424682617, + "learning_rate": 0.00017289377289377285, + "loss": 0.8013, + "step": 1314 + }, + { + "epoch": 1.4444597006727997, + "grad_norm": 0.46340033411979675, + "learning_rate": 0.00017277167277167277, + "loss": 0.73, + "step": 1315 + }, + { + "epoch": 1.445558149114376, + "grad_norm": 0.49221453070640564, + "learning_rate": 0.00017264957264957265, + "loss": 0.6735, + "step": 1316 + }, + { + "epoch": 1.4466565975559522, + "grad_norm": 0.36250925064086914, + "learning_rate": 0.0001725274725274725, + "loss": 0.7463, + "step": 1317 + }, + { + "epoch": 1.4477550459975284, + "grad_norm": 0.3832615911960602, + "learning_rate": 0.0001724053724053724, + "loss": 0.7295, + "step": 1318 + }, + { + "epoch": 1.4488534944391047, + "grad_norm": 0.7413591742515564, + "learning_rate": 0.00017228327228327228, + "loss": 0.7627, + "step": 1319 + }, + { + "epoch": 1.449951942880681, + "grad_norm": 0.45626765489578247, + "learning_rate": 0.00017216117216117214, + "loss": 0.727, + "step": 1320 + }, + { + "epoch": 1.4510503913222572, + "grad_norm": 0.3024120330810547, + "learning_rate": 0.00017203907203907202, + "loss": 0.3986, + "step": 1321 + }, + { + "epoch": 1.4521488397638336, + "grad_norm": 0.31635284423828125, + "learning_rate": 0.0001719169719169719, + "loss": 0.3469, + "step": 1322 + }, + { + "epoch": 1.45324728820541, + "grad_norm": 0.36893391609191895, + "learning_rate": 0.00017179487179487177, + "loss": 0.7017, + "step": 1323 + }, + { + "epoch": 1.454345736646986, + "grad_norm": 0.4804024398326874, + "learning_rate": 0.00017167277167277165, + "loss": 0.8811, + "step": 1324 + }, + { + "epoch": 1.4554441850885624, + "grad_norm": 0.4446522295475006, + "learning_rate": 0.00017155067155067154, + "loss": 0.8027, + "step": 1325 + }, + { + "epoch": 1.4565426335301388, + "grad_norm": 0.27936413884162903, + "learning_rate": 0.0001714285714285714, + "loss": 0.3846, + "step": 1326 + }, + { + "epoch": 1.457641081971715, + "grad_norm": 0.3312259316444397, + "learning_rate": 0.00017130647130647128, + "loss": 0.4852, + "step": 1327 + }, + { + "epoch": 1.4587395304132913, + "grad_norm": 0.4751642644405365, + "learning_rate": 0.0001711843711843712, + "loss": 0.7337, + "step": 1328 + }, + { + "epoch": 1.4598379788548674, + "grad_norm": 0.5365067720413208, + "learning_rate": 0.00017106227106227103, + "loss": 0.8052, + "step": 1329 + }, + { + "epoch": 1.4609364272964438, + "grad_norm": 0.5944942831993103, + "learning_rate": 0.00017094017094017094, + "loss": 0.7673, + "step": 1330 + }, + { + "epoch": 1.46203487573802, + "grad_norm": 0.48244431614875793, + "learning_rate": 0.00017081807081807083, + "loss": 0.855, + "step": 1331 + }, + { + "epoch": 1.4631333241795963, + "grad_norm": 0.32348135113716125, + "learning_rate": 0.00017069597069597068, + "loss": 0.5133, + "step": 1332 + }, + { + "epoch": 1.4642317726211727, + "grad_norm": 0.6455866694450378, + "learning_rate": 0.00017057387057387057, + "loss": 0.6825, + "step": 1333 + }, + { + "epoch": 1.4653302210627488, + "grad_norm": 0.3937522768974304, + "learning_rate": 0.00017045177045177045, + "loss": 0.6335, + "step": 1334 + }, + { + "epoch": 1.4664286695043252, + "grad_norm": 0.33579352498054504, + "learning_rate": 0.0001703296703296703, + "loss": 0.4711, + "step": 1335 + }, + { + "epoch": 1.4675271179459015, + "grad_norm": 0.5055533647537231, + "learning_rate": 0.0001702075702075702, + "loss": 0.6512, + "step": 1336 + }, + { + "epoch": 1.4686255663874777, + "grad_norm": 0.40702182054519653, + "learning_rate": 0.00017008547008547006, + "loss": 0.8833, + "step": 1337 + }, + { + "epoch": 1.469724014829054, + "grad_norm": 0.3574135899543762, + "learning_rate": 0.00016996336996336994, + "loss": 0.7127, + "step": 1338 + }, + { + "epoch": 1.4708224632706302, + "grad_norm": 0.45641472935676575, + "learning_rate": 0.00016984126984126983, + "loss": 0.7258, + "step": 1339 + }, + { + "epoch": 1.4719209117122065, + "grad_norm": 1.5012352466583252, + "learning_rate": 0.0001697191697191697, + "loss": 0.8065, + "step": 1340 + }, + { + "epoch": 1.4730193601537827, + "grad_norm": 0.5025885701179504, + "learning_rate": 0.00016959706959706957, + "loss": 0.9377, + "step": 1341 + }, + { + "epoch": 1.474117808595359, + "grad_norm": 0.2942202687263489, + "learning_rate": 0.00016947496947496946, + "loss": 0.5693, + "step": 1342 + }, + { + "epoch": 1.4752162570369354, + "grad_norm": 0.48770126700401306, + "learning_rate": 0.00016935286935286932, + "loss": 0.5483, + "step": 1343 + }, + { + "epoch": 1.4763147054785115, + "grad_norm": 0.3853349983692169, + "learning_rate": 0.0001692307692307692, + "loss": 0.5787, + "step": 1344 + }, + { + "epoch": 1.4774131539200879, + "grad_norm": 0.3593169152736664, + "learning_rate": 0.00016910866910866911, + "loss": 0.6426, + "step": 1345 + }, + { + "epoch": 1.4785116023616642, + "grad_norm": 0.5932713150978088, + "learning_rate": 0.00016898656898656897, + "loss": 0.7543, + "step": 1346 + }, + { + "epoch": 1.4796100508032404, + "grad_norm": 0.43406638503074646, + "learning_rate": 0.00016886446886446886, + "loss": 0.7868, + "step": 1347 + }, + { + "epoch": 1.4807084992448167, + "grad_norm": 0.38596048951148987, + "learning_rate": 0.00016874236874236874, + "loss": 0.49, + "step": 1348 + }, + { + "epoch": 1.481806947686393, + "grad_norm": 0.42844533920288086, + "learning_rate": 0.0001686202686202686, + "loss": 0.6485, + "step": 1349 + }, + { + "epoch": 1.4829053961279692, + "grad_norm": 0.5165280103683472, + "learning_rate": 0.0001684981684981685, + "loss": 0.6924, + "step": 1350 + }, + { + "epoch": 1.4840038445695456, + "grad_norm": 0.5717988610267639, + "learning_rate": 0.00016837606837606837, + "loss": 0.5624, + "step": 1351 + }, + { + "epoch": 1.4851022930111217, + "grad_norm": 0.4384293556213379, + "learning_rate": 0.00016825396825396823, + "loss": 0.7895, + "step": 1352 + }, + { + "epoch": 1.486200741452698, + "grad_norm": 0.5472243428230286, + "learning_rate": 0.00016813186813186812, + "loss": 0.8838, + "step": 1353 + }, + { + "epoch": 1.4872991898942742, + "grad_norm": 0.3903232216835022, + "learning_rate": 0.000168009768009768, + "loss": 0.5452, + "step": 1354 + }, + { + "epoch": 1.4883976383358506, + "grad_norm": 0.3799583613872528, + "learning_rate": 0.00016788766788766786, + "loss": 0.8931, + "step": 1355 + }, + { + "epoch": 1.489496086777427, + "grad_norm": 0.4481349289417267, + "learning_rate": 0.00016776556776556775, + "loss": 0.5956, + "step": 1356 + }, + { + "epoch": 1.490594535219003, + "grad_norm": 0.45875266194343567, + "learning_rate": 0.00016764346764346763, + "loss": 0.4729, + "step": 1357 + }, + { + "epoch": 1.4916929836605795, + "grad_norm": 0.494112104177475, + "learning_rate": 0.0001675213675213675, + "loss": 0.6416, + "step": 1358 + }, + { + "epoch": 1.4927914321021558, + "grad_norm": 0.3976772725582123, + "learning_rate": 0.0001673992673992674, + "loss": 0.6601, + "step": 1359 + }, + { + "epoch": 1.493889880543732, + "grad_norm": 0.29009610414505005, + "learning_rate": 0.0001672771672771673, + "loss": 0.4261, + "step": 1360 + }, + { + "epoch": 1.4949883289853083, + "grad_norm": 0.5540419816970825, + "learning_rate": 0.00016715506715506715, + "loss": 0.8206, + "step": 1361 + }, + { + "epoch": 1.4960867774268845, + "grad_norm": 0.41308313608169556, + "learning_rate": 0.00016703296703296703, + "loss": 0.7862, + "step": 1362 + }, + { + "epoch": 1.4971852258684608, + "grad_norm": 0.6565150618553162, + "learning_rate": 0.0001669108669108669, + "loss": 0.6963, + "step": 1363 + }, + { + "epoch": 1.498283674310037, + "grad_norm": 0.4901321530342102, + "learning_rate": 0.00016678876678876678, + "loss": 0.7063, + "step": 1364 + }, + { + "epoch": 1.4993821227516133, + "grad_norm": 0.4676086902618408, + "learning_rate": 0.00016666666666666666, + "loss": 0.5142, + "step": 1365 + }, + { + "epoch": 1.5004805711931897, + "grad_norm": 0.4745628833770752, + "learning_rate": 0.00016654456654456652, + "loss": 0.7659, + "step": 1366 + }, + { + "epoch": 1.5015790196347658, + "grad_norm": 0.42693057656288147, + "learning_rate": 0.0001664224664224664, + "loss": 0.9233, + "step": 1367 + }, + { + "epoch": 1.5026774680763422, + "grad_norm": 0.4110391139984131, + "learning_rate": 0.0001663003663003663, + "loss": 0.5062, + "step": 1368 + }, + { + "epoch": 1.5037759165179185, + "grad_norm": 0.3090996742248535, + "learning_rate": 0.00016617826617826615, + "loss": 0.4462, + "step": 1369 + }, + { + "epoch": 1.5048743649594947, + "grad_norm": 0.42027410864830017, + "learning_rate": 0.00016605616605616603, + "loss": 0.8589, + "step": 1370 + }, + { + "epoch": 1.505972813401071, + "grad_norm": 0.38396796584129333, + "learning_rate": 0.00016593406593406592, + "loss": 0.6609, + "step": 1371 + }, + { + "epoch": 1.5070712618426474, + "grad_norm": 0.5236012935638428, + "learning_rate": 0.00016581196581196578, + "loss": 0.6506, + "step": 1372 + }, + { + "epoch": 1.5081697102842235, + "grad_norm": 0.7232113480567932, + "learning_rate": 0.00016568986568986566, + "loss": 0.6689, + "step": 1373 + }, + { + "epoch": 1.5092681587257997, + "grad_norm": 0.4777502417564392, + "learning_rate": 0.00016556776556776558, + "loss": 0.5701, + "step": 1374 + }, + { + "epoch": 1.510366607167376, + "grad_norm": 0.39154767990112305, + "learning_rate": 0.0001654456654456654, + "loss": 0.4906, + "step": 1375 + }, + { + "epoch": 1.5114650556089524, + "grad_norm": 0.469382107257843, + "learning_rate": 0.00016532356532356532, + "loss": 0.5768, + "step": 1376 + }, + { + "epoch": 1.5125635040505285, + "grad_norm": 0.3485945761203766, + "learning_rate": 0.0001652014652014652, + "loss": 0.7814, + "step": 1377 + }, + { + "epoch": 1.513661952492105, + "grad_norm": 0.4375949203968048, + "learning_rate": 0.00016507936507936506, + "loss": 0.6328, + "step": 1378 + }, + { + "epoch": 1.5147604009336813, + "grad_norm": 0.47778064012527466, + "learning_rate": 0.00016495726495726495, + "loss": 0.635, + "step": 1379 + }, + { + "epoch": 1.5158588493752574, + "grad_norm": 0.3515126705169678, + "learning_rate": 0.00016483516483516484, + "loss": 0.7014, + "step": 1380 + }, + { + "epoch": 1.5169572978168337, + "grad_norm": 0.3710018992424011, + "learning_rate": 0.0001647130647130647, + "loss": 0.7903, + "step": 1381 + }, + { + "epoch": 1.51805574625841, + "grad_norm": 0.37630394101142883, + "learning_rate": 0.00016459096459096458, + "loss": 0.5446, + "step": 1382 + }, + { + "epoch": 1.5191541946999862, + "grad_norm": 0.4312807321548462, + "learning_rate": 0.00016446886446886446, + "loss": 0.6101, + "step": 1383 + }, + { + "epoch": 1.5202526431415624, + "grad_norm": 0.399384468793869, + "learning_rate": 0.00016434676434676432, + "loss": 0.5734, + "step": 1384 + }, + { + "epoch": 1.521351091583139, + "grad_norm": 0.41233471035957336, + "learning_rate": 0.0001642246642246642, + "loss": 0.6525, + "step": 1385 + }, + { + "epoch": 1.522449540024715, + "grad_norm": 0.5215228199958801, + "learning_rate": 0.0001641025641025641, + "loss": 0.4804, + "step": 1386 + }, + { + "epoch": 1.5235479884662912, + "grad_norm": 0.42069393396377563, + "learning_rate": 0.00016398046398046395, + "loss": 0.5517, + "step": 1387 + }, + { + "epoch": 1.5246464369078676, + "grad_norm": 1.7902978658676147, + "learning_rate": 0.00016385836385836384, + "loss": 0.6295, + "step": 1388 + }, + { + "epoch": 1.525744885349444, + "grad_norm": 0.7353507280349731, + "learning_rate": 0.0001637362637362637, + "loss": 1.0585, + "step": 1389 + }, + { + "epoch": 1.52684333379102, + "grad_norm": 0.45992404222488403, + "learning_rate": 0.0001636141636141636, + "loss": 0.7671, + "step": 1390 + }, + { + "epoch": 1.5279417822325965, + "grad_norm": 0.3927334249019623, + "learning_rate": 0.0001634920634920635, + "loss": 0.7479, + "step": 1391 + }, + { + "epoch": 1.5290402306741728, + "grad_norm": 0.32833003997802734, + "learning_rate": 0.00016336996336996335, + "loss": 0.5774, + "step": 1392 + }, + { + "epoch": 1.530138679115749, + "grad_norm": 0.4306529462337494, + "learning_rate": 0.00016324786324786324, + "loss": 0.6317, + "step": 1393 + }, + { + "epoch": 1.5312371275573253, + "grad_norm": 0.5411052703857422, + "learning_rate": 0.00016312576312576312, + "loss": 0.6637, + "step": 1394 + }, + { + "epoch": 1.5323355759989017, + "grad_norm": 0.633800745010376, + "learning_rate": 0.00016300366300366298, + "loss": 0.7145, + "step": 1395 + }, + { + "epoch": 1.5334340244404778, + "grad_norm": 0.6986578702926636, + "learning_rate": 0.00016288156288156287, + "loss": 0.7194, + "step": 1396 + }, + { + "epoch": 1.534532472882054, + "grad_norm": 0.5223686695098877, + "learning_rate": 0.00016275946275946275, + "loss": 0.7849, + "step": 1397 + }, + { + "epoch": 1.5356309213236303, + "grad_norm": 0.5342483520507812, + "learning_rate": 0.0001626373626373626, + "loss": 0.8885, + "step": 1398 + }, + { + "epoch": 1.5367293697652067, + "grad_norm": 0.5467656850814819, + "learning_rate": 0.0001625152625152625, + "loss": 0.6265, + "step": 1399 + }, + { + "epoch": 1.5378278182067828, + "grad_norm": 0.4483658969402313, + "learning_rate": 0.00016239316239316238, + "loss": 0.7133, + "step": 1400 + }, + { + "epoch": 1.5389262666483592, + "grad_norm": 0.5714216232299805, + "learning_rate": 0.00016227106227106224, + "loss": 0.5212, + "step": 1401 + }, + { + "epoch": 1.5400247150899355, + "grad_norm": 0.5487145781517029, + "learning_rate": 0.00016214896214896213, + "loss": 0.6276, + "step": 1402 + }, + { + "epoch": 1.5411231635315117, + "grad_norm": 0.3687078654766083, + "learning_rate": 0.00016202686202686204, + "loss": 0.7512, + "step": 1403 + }, + { + "epoch": 1.542221611973088, + "grad_norm": 0.3596762418746948, + "learning_rate": 0.00016190476190476187, + "loss": 0.7192, + "step": 1404 + }, + { + "epoch": 1.5433200604146644, + "grad_norm": 0.4092305898666382, + "learning_rate": 0.00016178266178266178, + "loss": 0.7339, + "step": 1405 + }, + { + "epoch": 1.5444185088562405, + "grad_norm": 0.4018193483352661, + "learning_rate": 0.00016166056166056167, + "loss": 0.7213, + "step": 1406 + }, + { + "epoch": 1.5455169572978167, + "grad_norm": 0.4993208646774292, + "learning_rate": 0.00016153846153846153, + "loss": 0.6362, + "step": 1407 + }, + { + "epoch": 1.5466154057393933, + "grad_norm": 0.3958855867385864, + "learning_rate": 0.0001614163614163614, + "loss": 0.8482, + "step": 1408 + }, + { + "epoch": 1.5477138541809694, + "grad_norm": 0.32689765095710754, + "learning_rate": 0.0001612942612942613, + "loss": 0.6583, + "step": 1409 + }, + { + "epoch": 1.5488123026225455, + "grad_norm": 0.48947611451148987, + "learning_rate": 0.00016117216117216116, + "loss": 0.6707, + "step": 1410 + }, + { + "epoch": 1.549910751064122, + "grad_norm": 0.3446139395236969, + "learning_rate": 0.00016105006105006104, + "loss": 0.8914, + "step": 1411 + }, + { + "epoch": 1.5510091995056983, + "grad_norm": 0.585746705532074, + "learning_rate": 0.0001609279609279609, + "loss": 0.5413, + "step": 1412 + }, + { + "epoch": 1.5521076479472744, + "grad_norm": 0.6561328172683716, + "learning_rate": 0.00016080586080586079, + "loss": 0.3728, + "step": 1413 + }, + { + "epoch": 1.5532060963888508, + "grad_norm": 0.47158828377723694, + "learning_rate": 0.00016068376068376067, + "loss": 0.6525, + "step": 1414 + }, + { + "epoch": 1.5543045448304271, + "grad_norm": 0.3676914572715759, + "learning_rate": 0.00016056166056166053, + "loss": 0.7395, + "step": 1415 + }, + { + "epoch": 1.5554029932720033, + "grad_norm": 0.608076810836792, + "learning_rate": 0.00016043956043956041, + "loss": 0.5289, + "step": 1416 + }, + { + "epoch": 1.5565014417135794, + "grad_norm": 0.44940462708473206, + "learning_rate": 0.0001603174603174603, + "loss": 0.6282, + "step": 1417 + }, + { + "epoch": 1.557599890155156, + "grad_norm": 0.48062869906425476, + "learning_rate": 0.00016019536019536016, + "loss": 0.7438, + "step": 1418 + }, + { + "epoch": 1.5586983385967321, + "grad_norm": 0.43834635615348816, + "learning_rate": 0.00016007326007326004, + "loss": 0.4248, + "step": 1419 + }, + { + "epoch": 1.5597967870383083, + "grad_norm": 0.5203731060028076, + "learning_rate": 0.00015995115995115996, + "loss": 0.91, + "step": 1420 + }, + { + "epoch": 1.5608952354798846, + "grad_norm": 0.5766960978507996, + "learning_rate": 0.00015982905982905981, + "loss": 0.7211, + "step": 1421 + }, + { + "epoch": 1.561993683921461, + "grad_norm": 0.3048666715621948, + "learning_rate": 0.0001597069597069597, + "loss": 0.5618, + "step": 1422 + }, + { + "epoch": 1.5630921323630371, + "grad_norm": 0.3916679322719574, + "learning_rate": 0.00015958485958485959, + "loss": 0.6954, + "step": 1423 + }, + { + "epoch": 1.5641905808046135, + "grad_norm": 0.6336612105369568, + "learning_rate": 0.00015946275946275944, + "loss": 0.6368, + "step": 1424 + }, + { + "epoch": 1.5652890292461898, + "grad_norm": 0.8314816355705261, + "learning_rate": 0.00015934065934065933, + "loss": 0.7633, + "step": 1425 + }, + { + "epoch": 1.566387477687766, + "grad_norm": 0.46973487734794617, + "learning_rate": 0.00015921855921855922, + "loss": 0.6915, + "step": 1426 + }, + { + "epoch": 1.5674859261293423, + "grad_norm": 0.48737633228302, + "learning_rate": 0.00015909645909645907, + "loss": 0.5346, + "step": 1427 + }, + { + "epoch": 1.5685843745709187, + "grad_norm": 0.548876941204071, + "learning_rate": 0.00015897435897435896, + "loss": 1.0449, + "step": 1428 + }, + { + "epoch": 1.5696828230124948, + "grad_norm": 0.5039654970169067, + "learning_rate": 0.00015885225885225884, + "loss": 0.9953, + "step": 1429 + }, + { + "epoch": 1.570781271454071, + "grad_norm": 0.7233378887176514, + "learning_rate": 0.0001587301587301587, + "loss": 0.7068, + "step": 1430 + }, + { + "epoch": 1.5718797198956473, + "grad_norm": 0.5767638683319092, + "learning_rate": 0.0001586080586080586, + "loss": 0.8055, + "step": 1431 + }, + { + "epoch": 1.5729781683372237, + "grad_norm": 0.34450021386146545, + "learning_rate": 0.00015848595848595847, + "loss": 0.726, + "step": 1432 + }, + { + "epoch": 1.5740766167787998, + "grad_norm": 0.8474962711334229, + "learning_rate": 0.00015836385836385833, + "loss": 0.6974, + "step": 1433 + }, + { + "epoch": 1.5751750652203762, + "grad_norm": 1.565746545791626, + "learning_rate": 0.00015824175824175824, + "loss": 0.7766, + "step": 1434 + }, + { + "epoch": 1.5762735136619526, + "grad_norm": 0.4393616020679474, + "learning_rate": 0.00015811965811965813, + "loss": 0.6071, + "step": 1435 + }, + { + "epoch": 1.5773719621035287, + "grad_norm": 0.5209214091300964, + "learning_rate": 0.000157997557997558, + "loss": 0.7546, + "step": 1436 + }, + { + "epoch": 1.578470410545105, + "grad_norm": 0.6069398522377014, + "learning_rate": 0.00015787545787545787, + "loss": 0.7322, + "step": 1437 + }, + { + "epoch": 1.5795688589866814, + "grad_norm": 0.6168296337127686, + "learning_rate": 0.00015775335775335773, + "loss": 0.5169, + "step": 1438 + }, + { + "epoch": 1.5806673074282576, + "grad_norm": 0.25368016958236694, + "learning_rate": 0.00015763125763125762, + "loss": 0.4838, + "step": 1439 + }, + { + "epoch": 1.5817657558698337, + "grad_norm": 0.4165039360523224, + "learning_rate": 0.0001575091575091575, + "loss": 1.0135, + "step": 1440 + }, + { + "epoch": 1.5828642043114103, + "grad_norm": 0.4596197307109833, + "learning_rate": 0.00015738705738705736, + "loss": 0.5545, + "step": 1441 + }, + { + "epoch": 1.5839626527529864, + "grad_norm": 0.5077592730522156, + "learning_rate": 0.00015726495726495725, + "loss": 0.7754, + "step": 1442 + }, + { + "epoch": 1.5850611011945626, + "grad_norm": 0.5041285157203674, + "learning_rate": 0.00015714285714285713, + "loss": 0.8384, + "step": 1443 + }, + { + "epoch": 1.586159549636139, + "grad_norm": 0.40924420952796936, + "learning_rate": 0.000157020757020757, + "loss": 0.5511, + "step": 1444 + }, + { + "epoch": 1.5872579980777153, + "grad_norm": 0.4800551235675812, + "learning_rate": 0.00015689865689865688, + "loss": 0.6154, + "step": 1445 + }, + { + "epoch": 1.5883564465192914, + "grad_norm": 0.433174729347229, + "learning_rate": 0.00015677655677655676, + "loss": 0.6158, + "step": 1446 + }, + { + "epoch": 1.5894548949608678, + "grad_norm": 0.29649895429611206, + "learning_rate": 0.00015665445665445662, + "loss": 0.5729, + "step": 1447 + }, + { + "epoch": 1.5905533434024441, + "grad_norm": 0.3815969228744507, + "learning_rate": 0.0001565323565323565, + "loss": 0.6748, + "step": 1448 + }, + { + "epoch": 1.5916517918440203, + "grad_norm": 0.4933919608592987, + "learning_rate": 0.00015641025641025642, + "loss": 0.7683, + "step": 1449 + }, + { + "epoch": 1.5927502402855966, + "grad_norm": 0.5053071975708008, + "learning_rate": 0.00015628815628815625, + "loss": 0.6779, + "step": 1450 + }, + { + "epoch": 1.593848688727173, + "grad_norm": 0.3900013566017151, + "learning_rate": 0.00015616605616605616, + "loss": 0.6326, + "step": 1451 + }, + { + "epoch": 1.5949471371687491, + "grad_norm": 0.5823982357978821, + "learning_rate": 0.00015604395604395605, + "loss": 0.6104, + "step": 1452 + }, + { + "epoch": 1.5960455856103253, + "grad_norm": 0.5277792811393738, + "learning_rate": 0.0001559218559218559, + "loss": 0.6647, + "step": 1453 + }, + { + "epoch": 1.5971440340519016, + "grad_norm": 0.32926440238952637, + "learning_rate": 0.0001557997557997558, + "loss": 0.6064, + "step": 1454 + }, + { + "epoch": 1.598242482493478, + "grad_norm": 0.7350378036499023, + "learning_rate": 0.00015567765567765568, + "loss": 0.7951, + "step": 1455 + }, + { + "epoch": 1.5993409309350541, + "grad_norm": 0.4125807285308838, + "learning_rate": 0.00015555555555555554, + "loss": 0.7761, + "step": 1456 + }, + { + "epoch": 1.6004393793766305, + "grad_norm": 0.49707722663879395, + "learning_rate": 0.00015543345543345542, + "loss": 0.7299, + "step": 1457 + }, + { + "epoch": 1.6015378278182069, + "grad_norm": 0.3240358829498291, + "learning_rate": 0.0001553113553113553, + "loss": 0.4832, + "step": 1458 + }, + { + "epoch": 1.602636276259783, + "grad_norm": 0.44430434703826904, + "learning_rate": 0.00015518925518925517, + "loss": 0.5968, + "step": 1459 + }, + { + "epoch": 1.6037347247013594, + "grad_norm": 0.3702992796897888, + "learning_rate": 0.00015506715506715505, + "loss": 0.7177, + "step": 1460 + }, + { + "epoch": 1.6048331731429357, + "grad_norm": 0.5001052618026733, + "learning_rate": 0.00015494505494505494, + "loss": 0.7448, + "step": 1461 + }, + { + "epoch": 1.6059316215845119, + "grad_norm": 0.45969969034194946, + "learning_rate": 0.0001548229548229548, + "loss": 0.8292, + "step": 1462 + }, + { + "epoch": 1.607030070026088, + "grad_norm": 0.46075674891471863, + "learning_rate": 0.00015470085470085468, + "loss": 0.5624, + "step": 1463 + }, + { + "epoch": 1.6081285184676646, + "grad_norm": 2.077080488204956, + "learning_rate": 0.00015457875457875454, + "loss": 0.6643, + "step": 1464 + }, + { + "epoch": 1.6092269669092407, + "grad_norm": 0.46008172631263733, + "learning_rate": 0.00015445665445665445, + "loss": 0.6329, + "step": 1465 + }, + { + "epoch": 1.6103254153508169, + "grad_norm": 0.5016405582427979, + "learning_rate": 0.00015433455433455434, + "loss": 0.7692, + "step": 1466 + }, + { + "epoch": 1.6114238637923932, + "grad_norm": 0.46292269229888916, + "learning_rate": 0.0001542124542124542, + "loss": 0.6485, + "step": 1467 + }, + { + "epoch": 1.6125223122339696, + "grad_norm": 0.4498538672924042, + "learning_rate": 0.00015409035409035408, + "loss": 0.598, + "step": 1468 + }, + { + "epoch": 1.6136207606755457, + "grad_norm": 0.3537295162677765, + "learning_rate": 0.00015396825396825397, + "loss": 0.6356, + "step": 1469 + }, + { + "epoch": 1.614719209117122, + "grad_norm": 0.9966747164726257, + "learning_rate": 0.00015384615384615382, + "loss": 0.6627, + "step": 1470 + }, + { + "epoch": 1.6158176575586984, + "grad_norm": 0.9386951327323914, + "learning_rate": 0.0001537240537240537, + "loss": 0.8148, + "step": 1471 + }, + { + "epoch": 1.6169161060002746, + "grad_norm": 0.3452979028224945, + "learning_rate": 0.0001536019536019536, + "loss": 0.5778, + "step": 1472 + }, + { + "epoch": 1.618014554441851, + "grad_norm": 0.3443523049354553, + "learning_rate": 0.00015347985347985345, + "loss": 0.9228, + "step": 1473 + }, + { + "epoch": 1.6191130028834273, + "grad_norm": 0.5345872044563293, + "learning_rate": 0.00015335775335775334, + "loss": 0.4682, + "step": 1474 + }, + { + "epoch": 1.6202114513250034, + "grad_norm": 0.35112351179122925, + "learning_rate": 0.00015323565323565322, + "loss": 0.5482, + "step": 1475 + }, + { + "epoch": 1.6213098997665796, + "grad_norm": 0.39090535044670105, + "learning_rate": 0.00015311355311355308, + "loss": 0.825, + "step": 1476 + }, + { + "epoch": 1.622408348208156, + "grad_norm": 1.1684538125991821, + "learning_rate": 0.00015299145299145297, + "loss": 0.6561, + "step": 1477 + }, + { + "epoch": 1.6235067966497323, + "grad_norm": 0.4006233513355255, + "learning_rate": 0.00015286935286935288, + "loss": 0.3647, + "step": 1478 + }, + { + "epoch": 1.6246052450913084, + "grad_norm": 0.30577126145362854, + "learning_rate": 0.0001527472527472527, + "loss": 0.4934, + "step": 1479 + }, + { + "epoch": 1.6257036935328848, + "grad_norm": 0.39927995204925537, + "learning_rate": 0.00015262515262515263, + "loss": 0.6028, + "step": 1480 + }, + { + "epoch": 1.6268021419744612, + "grad_norm": 0.49143150448799133, + "learning_rate": 0.0001525030525030525, + "loss": 0.4595, + "step": 1481 + }, + { + "epoch": 1.6279005904160373, + "grad_norm": 0.8603225946426392, + "learning_rate": 0.00015238095238095237, + "loss": 0.8617, + "step": 1482 + }, + { + "epoch": 1.6289990388576137, + "grad_norm": 0.534269392490387, + "learning_rate": 0.00015225885225885225, + "loss": 0.6648, + "step": 1483 + }, + { + "epoch": 1.63009748729919, + "grad_norm": 0.4987354278564453, + "learning_rate": 0.00015213675213675214, + "loss": 0.5908, + "step": 1484 + }, + { + "epoch": 1.6311959357407662, + "grad_norm": 0.5739774107933044, + "learning_rate": 0.000152014652014652, + "loss": 0.7652, + "step": 1485 + }, + { + "epoch": 1.6322943841823423, + "grad_norm": 0.5343801975250244, + "learning_rate": 0.00015189255189255188, + "loss": 0.6864, + "step": 1486 + }, + { + "epoch": 1.6333928326239189, + "grad_norm": 0.45683905482292175, + "learning_rate": 0.00015177045177045177, + "loss": 0.7179, + "step": 1487 + }, + { + "epoch": 1.634491281065495, + "grad_norm": 0.5020450949668884, + "learning_rate": 0.00015164835164835163, + "loss": 0.4356, + "step": 1488 + }, + { + "epoch": 1.6355897295070712, + "grad_norm": 0.3870914876461029, + "learning_rate": 0.0001515262515262515, + "loss": 0.692, + "step": 1489 + }, + { + "epoch": 1.6366881779486475, + "grad_norm": 0.5256255269050598, + "learning_rate": 0.00015140415140415137, + "loss": 0.7184, + "step": 1490 + }, + { + "epoch": 1.6377866263902239, + "grad_norm": 0.27588197588920593, + "learning_rate": 0.00015128205128205126, + "loss": 0.6928, + "step": 1491 + }, + { + "epoch": 1.6388850748318, + "grad_norm": 0.43336692452430725, + "learning_rate": 0.00015115995115995114, + "loss": 0.7357, + "step": 1492 + }, + { + "epoch": 1.6399835232733764, + "grad_norm": 0.7952486872673035, + "learning_rate": 0.000151037851037851, + "loss": 0.5536, + "step": 1493 + }, + { + "epoch": 1.6410819717149527, + "grad_norm": 3.8659090995788574, + "learning_rate": 0.00015091575091575089, + "loss": 0.6409, + "step": 1494 + }, + { + "epoch": 1.6421804201565289, + "grad_norm": 0.3824027478694916, + "learning_rate": 0.0001507936507936508, + "loss": 0.5988, + "step": 1495 + }, + { + "epoch": 1.643278868598105, + "grad_norm": 0.45106491446495056, + "learning_rate": 0.00015067155067155066, + "loss": 0.7568, + "step": 1496 + }, + { + "epoch": 1.6443773170396816, + "grad_norm": 0.719417154788971, + "learning_rate": 0.00015054945054945054, + "loss": 0.8191, + "step": 1497 + }, + { + "epoch": 1.6454757654812577, + "grad_norm": 0.4702167212963104, + "learning_rate": 0.00015042735042735043, + "loss": 0.6761, + "step": 1498 + }, + { + "epoch": 1.6465742139228339, + "grad_norm": 0.49441996216773987, + "learning_rate": 0.0001503052503052503, + "loss": 0.7323, + "step": 1499 + }, + { + "epoch": 1.6476726623644102, + "grad_norm": 0.623470664024353, + "learning_rate": 0.00015018315018315017, + "loss": 0.8384, + "step": 1500 + }, + { + "epoch": 1.6487711108059866, + "grad_norm": 0.5583334565162659, + "learning_rate": 0.00015006105006105006, + "loss": 0.8238, + "step": 1501 + }, + { + "epoch": 1.6498695592475627, + "grad_norm": 0.4803924560546875, + "learning_rate": 0.00014993894993894994, + "loss": 0.5322, + "step": 1502 + }, + { + "epoch": 1.650968007689139, + "grad_norm": 0.709605872631073, + "learning_rate": 0.0001498168498168498, + "loss": 0.8254, + "step": 1503 + }, + { + "epoch": 1.6520664561307155, + "grad_norm": 0.48047375679016113, + "learning_rate": 0.0001496947496947497, + "loss": 0.5263, + "step": 1504 + }, + { + "epoch": 1.6531649045722916, + "grad_norm": 0.41796261072158813, + "learning_rate": 0.00014957264957264957, + "loss": 0.5803, + "step": 1505 + }, + { + "epoch": 1.654263353013868, + "grad_norm": 0.7576707601547241, + "learning_rate": 0.00014945054945054943, + "loss": 0.545, + "step": 1506 + }, + { + "epoch": 1.6553618014554443, + "grad_norm": 0.4668630063533783, + "learning_rate": 0.00014932844932844932, + "loss": 0.6213, + "step": 1507 + }, + { + "epoch": 1.6564602498970205, + "grad_norm": 0.9730806350708008, + "learning_rate": 0.00014920634920634917, + "loss": 0.5415, + "step": 1508 + }, + { + "epoch": 1.6575586983385966, + "grad_norm": 0.39670151472091675, + "learning_rate": 0.0001490842490842491, + "loss": 0.7931, + "step": 1509 + }, + { + "epoch": 1.658657146780173, + "grad_norm": 0.6003556847572327, + "learning_rate": 0.00014896214896214895, + "loss": 0.7494, + "step": 1510 + }, + { + "epoch": 1.6597555952217493, + "grad_norm": 0.4335152506828308, + "learning_rate": 0.00014884004884004883, + "loss": 0.7003, + "step": 1511 + }, + { + "epoch": 1.6608540436633255, + "grad_norm": 0.34025630354881287, + "learning_rate": 0.00014871794871794872, + "loss": 0.9012, + "step": 1512 + }, + { + "epoch": 1.6619524921049018, + "grad_norm": 0.403934508562088, + "learning_rate": 0.00014859584859584858, + "loss": 0.717, + "step": 1513 + }, + { + "epoch": 1.6630509405464782, + "grad_norm": 0.45691147446632385, + "learning_rate": 0.00014847374847374846, + "loss": 0.4833, + "step": 1514 + }, + { + "epoch": 1.6641493889880543, + "grad_norm": 0.42266151309013367, + "learning_rate": 0.00014835164835164835, + "loss": 0.5892, + "step": 1515 + }, + { + "epoch": 1.6652478374296307, + "grad_norm": 0.392337441444397, + "learning_rate": 0.0001482295482295482, + "loss": 0.7748, + "step": 1516 + }, + { + "epoch": 1.666346285871207, + "grad_norm": 0.352081298828125, + "learning_rate": 0.0001481074481074481, + "loss": 0.6018, + "step": 1517 + }, + { + "epoch": 1.6674447343127832, + "grad_norm": 0.46293389797210693, + "learning_rate": 0.00014798534798534798, + "loss": 0.4696, + "step": 1518 + }, + { + "epoch": 1.6685431827543593, + "grad_norm": 0.6427372097969055, + "learning_rate": 0.00014786324786324786, + "loss": 0.7279, + "step": 1519 + }, + { + "epoch": 1.669641631195936, + "grad_norm": 0.500382125377655, + "learning_rate": 0.00014774114774114772, + "loss": 0.7395, + "step": 1520 + }, + { + "epoch": 1.670740079637512, + "grad_norm": 0.4410606920719147, + "learning_rate": 0.0001476190476190476, + "loss": 0.501, + "step": 1521 + }, + { + "epoch": 1.6718385280790882, + "grad_norm": 0.5587645769119263, + "learning_rate": 0.0001474969474969475, + "loss": 0.8655, + "step": 1522 + }, + { + "epoch": 1.6729369765206645, + "grad_norm": 0.4312286376953125, + "learning_rate": 0.00014737484737484735, + "loss": 0.9578, + "step": 1523 + }, + { + "epoch": 1.674035424962241, + "grad_norm": 0.48694175481796265, + "learning_rate": 0.00014725274725274723, + "loss": 0.6806, + "step": 1524 + }, + { + "epoch": 1.675133873403817, + "grad_norm": 0.39892563223838806, + "learning_rate": 0.00014713064713064712, + "loss": 0.598, + "step": 1525 + }, + { + "epoch": 1.6762323218453934, + "grad_norm": 0.4714735150337219, + "learning_rate": 0.000147008547008547, + "loss": 0.9637, + "step": 1526 + }, + { + "epoch": 1.6773307702869698, + "grad_norm": 0.8308823108673096, + "learning_rate": 0.00014688644688644686, + "loss": 0.7886, + "step": 1527 + }, + { + "epoch": 1.678429218728546, + "grad_norm": 0.5142358541488647, + "learning_rate": 0.00014676434676434675, + "loss": 0.8028, + "step": 1528 + }, + { + "epoch": 1.6795276671701223, + "grad_norm": 0.4001234471797943, + "learning_rate": 0.00014664224664224663, + "loss": 0.59, + "step": 1529 + }, + { + "epoch": 1.6806261156116986, + "grad_norm": 0.4112735688686371, + "learning_rate": 0.0001465201465201465, + "loss": 0.6523, + "step": 1530 + }, + { + "epoch": 1.6817245640532748, + "grad_norm": 0.4391016960144043, + "learning_rate": 0.0001463980463980464, + "loss": 0.7372, + "step": 1531 + }, + { + "epoch": 1.682823012494851, + "grad_norm": 0.7199782133102417, + "learning_rate": 0.00014627594627594626, + "loss": 0.8493, + "step": 1532 + }, + { + "epoch": 1.6839214609364273, + "grad_norm": 0.42379269003868103, + "learning_rate": 0.00014615384615384615, + "loss": 0.6609, + "step": 1533 + }, + { + "epoch": 1.6850199093780036, + "grad_norm": 0.41174909472465515, + "learning_rate": 0.000146031746031746, + "loss": 0.7021, + "step": 1534 + }, + { + "epoch": 1.6861183578195797, + "grad_norm": 0.4856640100479126, + "learning_rate": 0.0001459096459096459, + "loss": 0.6055, + "step": 1535 + }, + { + "epoch": 1.687216806261156, + "grad_norm": 0.5789656043052673, + "learning_rate": 0.00014578754578754578, + "loss": 0.7003, + "step": 1536 + }, + { + "epoch": 1.6883152547027325, + "grad_norm": 0.5711427330970764, + "learning_rate": 0.00014566544566544564, + "loss": 0.5762, + "step": 1537 + }, + { + "epoch": 1.6894137031443086, + "grad_norm": 0.3285518288612366, + "learning_rate": 0.00014554334554334552, + "loss": 0.6232, + "step": 1538 + }, + { + "epoch": 1.690512151585885, + "grad_norm": 0.48425230383872986, + "learning_rate": 0.0001454212454212454, + "loss": 0.5515, + "step": 1539 + }, + { + "epoch": 1.6916106000274613, + "grad_norm": 0.573079526424408, + "learning_rate": 0.0001452991452991453, + "loss": 0.7776, + "step": 1540 + }, + { + "epoch": 1.6927090484690375, + "grad_norm": 0.49084943532943726, + "learning_rate": 0.00014517704517704518, + "loss": 0.6504, + "step": 1541 + }, + { + "epoch": 1.6938074969106136, + "grad_norm": 0.46472617983818054, + "learning_rate": 0.00014505494505494504, + "loss": 0.6971, + "step": 1542 + }, + { + "epoch": 1.6949059453521902, + "grad_norm": 0.4890255033969879, + "learning_rate": 0.00014493284493284492, + "loss": 0.9292, + "step": 1543 + }, + { + "epoch": 1.6960043937937663, + "grad_norm": 0.42868301272392273, + "learning_rate": 0.0001448107448107448, + "loss": 0.6024, + "step": 1544 + }, + { + "epoch": 1.6971028422353425, + "grad_norm": 0.5118973255157471, + "learning_rate": 0.00014468864468864467, + "loss": 0.7598, + "step": 1545 + }, + { + "epoch": 1.6982012906769188, + "grad_norm": 0.40809181332588196, + "learning_rate": 0.00014456654456654455, + "loss": 0.5157, + "step": 1546 + }, + { + "epoch": 1.6992997391184952, + "grad_norm": 0.5236404538154602, + "learning_rate": 0.0001444444444444444, + "loss": 0.84, + "step": 1547 + }, + { + "epoch": 1.7003981875600713, + "grad_norm": 0.5712966322898865, + "learning_rate": 0.00014432234432234432, + "loss": 0.7208, + "step": 1548 + }, + { + "epoch": 1.7014966360016477, + "grad_norm": 0.2910475730895996, + "learning_rate": 0.00014420024420024418, + "loss": 0.4998, + "step": 1549 + }, + { + "epoch": 1.702595084443224, + "grad_norm": 0.5326736569404602, + "learning_rate": 0.00014407814407814407, + "loss": 0.5492, + "step": 1550 + }, + { + "epoch": 1.7036935328848002, + "grad_norm": 0.5454451441764832, + "learning_rate": 0.00014395604395604395, + "loss": 0.9016, + "step": 1551 + }, + { + "epoch": 1.7047919813263763, + "grad_norm": 0.45031625032424927, + "learning_rate": 0.0001438339438339438, + "loss": 0.671, + "step": 1552 + }, + { + "epoch": 1.705890429767953, + "grad_norm": 0.5496229529380798, + "learning_rate": 0.0001437118437118437, + "loss": 0.6333, + "step": 1553 + }, + { + "epoch": 1.706988878209529, + "grad_norm": 0.4200669825077057, + "learning_rate": 0.00014358974358974358, + "loss": 0.6158, + "step": 1554 + }, + { + "epoch": 1.7080873266511052, + "grad_norm": 0.7623536586761475, + "learning_rate": 0.00014346764346764347, + "loss": 0.686, + "step": 1555 + }, + { + "epoch": 1.7091857750926815, + "grad_norm": 0.3363445997238159, + "learning_rate": 0.00014334554334554333, + "loss": 0.305, + "step": 1556 + }, + { + "epoch": 1.710284223534258, + "grad_norm": 0.5042807459831238, + "learning_rate": 0.0001432234432234432, + "loss": 0.72, + "step": 1557 + }, + { + "epoch": 1.711382671975834, + "grad_norm": 0.5264353156089783, + "learning_rate": 0.0001431013431013431, + "loss": 0.6778, + "step": 1558 + }, + { + "epoch": 1.7124811204174104, + "grad_norm": 0.48960715532302856, + "learning_rate": 0.00014297924297924296, + "loss": 0.4935, + "step": 1559 + }, + { + "epoch": 1.7135795688589868, + "grad_norm": 0.4308861792087555, + "learning_rate": 0.00014285714285714284, + "loss": 0.6527, + "step": 1560 + }, + { + "epoch": 1.714678017300563, + "grad_norm": 0.42890703678131104, + "learning_rate": 0.00014273504273504273, + "loss": 0.4846, + "step": 1561 + }, + { + "epoch": 1.7157764657421393, + "grad_norm": 0.5222750902175903, + "learning_rate": 0.0001426129426129426, + "loss": 0.764, + "step": 1562 + }, + { + "epoch": 1.7168749141837156, + "grad_norm": 0.49664998054504395, + "learning_rate": 0.00014249084249084247, + "loss": 0.5728, + "step": 1563 + }, + { + "epoch": 1.7179733626252918, + "grad_norm": 0.3131520748138428, + "learning_rate": 0.00014236874236874236, + "loss": 0.5089, + "step": 1564 + }, + { + "epoch": 1.719071811066868, + "grad_norm": 0.5098987221717834, + "learning_rate": 0.00014224664224664224, + "loss": 0.781, + "step": 1565 + }, + { + "epoch": 1.7201702595084445, + "grad_norm": 0.4040893316268921, + "learning_rate": 0.0001421245421245421, + "loss": 0.7358, + "step": 1566 + }, + { + "epoch": 1.7212687079500206, + "grad_norm": 0.3601396679878235, + "learning_rate": 0.00014200244200244198, + "loss": 0.5531, + "step": 1567 + }, + { + "epoch": 1.7223671563915968, + "grad_norm": 0.6634377837181091, + "learning_rate": 0.00014188034188034187, + "loss": 0.6548, + "step": 1568 + }, + { + "epoch": 1.7234656048331731, + "grad_norm": 0.35935553908348083, + "learning_rate": 0.00014175824175824173, + "loss": 0.5653, + "step": 1569 + }, + { + "epoch": 1.7245640532747495, + "grad_norm": 0.4607802927494049, + "learning_rate": 0.00014163614163614164, + "loss": 0.9111, + "step": 1570 + }, + { + "epoch": 1.7256625017163256, + "grad_norm": 1.0116467475891113, + "learning_rate": 0.0001415140415140415, + "loss": 0.9226, + "step": 1571 + }, + { + "epoch": 1.726760950157902, + "grad_norm": 0.9484761953353882, + "learning_rate": 0.00014139194139194139, + "loss": 0.7536, + "step": 1572 + }, + { + "epoch": 1.7278593985994783, + "grad_norm": 0.3684981167316437, + "learning_rate": 0.00014126984126984124, + "loss": 0.5013, + "step": 1573 + }, + { + "epoch": 1.7289578470410545, + "grad_norm": 0.40037083625793457, + "learning_rate": 0.00014114774114774113, + "loss": 0.8069, + "step": 1574 + }, + { + "epoch": 1.7300562954826306, + "grad_norm": 0.42828282713890076, + "learning_rate": 0.00014102564102564101, + "loss": 0.5586, + "step": 1575 + }, + { + "epoch": 1.7311547439242072, + "grad_norm": 0.3461548686027527, + "learning_rate": 0.00014090354090354087, + "loss": 0.6045, + "step": 1576 + }, + { + "epoch": 1.7322531923657833, + "grad_norm": 0.622982919216156, + "learning_rate": 0.00014078144078144079, + "loss": 0.8943, + "step": 1577 + }, + { + "epoch": 1.7333516408073595, + "grad_norm": 0.3318479359149933, + "learning_rate": 0.00014065934065934064, + "loss": 0.4058, + "step": 1578 + }, + { + "epoch": 1.7344500892489358, + "grad_norm": 0.5178685188293457, + "learning_rate": 0.00014053724053724053, + "loss": 0.5839, + "step": 1579 + }, + { + "epoch": 1.7355485376905122, + "grad_norm": 0.44273868203163147, + "learning_rate": 0.00014041514041514042, + "loss": 0.5394, + "step": 1580 + }, + { + "epoch": 1.7366469861320883, + "grad_norm": 0.60169517993927, + "learning_rate": 0.00014029304029304027, + "loss": 0.6753, + "step": 1581 + }, + { + "epoch": 1.7377454345736647, + "grad_norm": 0.7691718339920044, + "learning_rate": 0.00014017094017094016, + "loss": 0.9618, + "step": 1582 + }, + { + "epoch": 1.738843883015241, + "grad_norm": 0.3900390565395355, + "learning_rate": 0.00014004884004884004, + "loss": 0.5809, + "step": 1583 + }, + { + "epoch": 1.7399423314568172, + "grad_norm": 0.6272429823875427, + "learning_rate": 0.00013992673992673993, + "loss": 0.8579, + "step": 1584 + }, + { + "epoch": 1.7410407798983936, + "grad_norm": 0.30017220973968506, + "learning_rate": 0.0001398046398046398, + "loss": 0.5335, + "step": 1585 + }, + { + "epoch": 1.74213922833997, + "grad_norm": 0.4937066435813904, + "learning_rate": 0.00013968253968253967, + "loss": 0.7941, + "step": 1586 + }, + { + "epoch": 1.743237676781546, + "grad_norm": 0.47317594289779663, + "learning_rate": 0.00013956043956043956, + "loss": 0.6013, + "step": 1587 + }, + { + "epoch": 1.7443361252231222, + "grad_norm": 1.9155733585357666, + "learning_rate": 0.00013943833943833942, + "loss": 0.6708, + "step": 1588 + }, + { + "epoch": 1.7454345736646986, + "grad_norm": 0.3844835162162781, + "learning_rate": 0.0001393162393162393, + "loss": 0.7176, + "step": 1589 + }, + { + "epoch": 1.746533022106275, + "grad_norm": 0.42810145020484924, + "learning_rate": 0.0001391941391941392, + "loss": 0.9255, + "step": 1590 + }, + { + "epoch": 1.747631470547851, + "grad_norm": 3.846015691757202, + "learning_rate": 0.00013907203907203905, + "loss": 0.6202, + "step": 1591 + }, + { + "epoch": 1.7487299189894274, + "grad_norm": 0.42783257365226746, + "learning_rate": 0.00013894993894993893, + "loss": 0.7451, + "step": 1592 + }, + { + "epoch": 1.7498283674310038, + "grad_norm": 0.5237023234367371, + "learning_rate": 0.00013882783882783882, + "loss": 0.7961, + "step": 1593 + }, + { + "epoch": 1.75092681587258, + "grad_norm": 2.5639729499816895, + "learning_rate": 0.0001387057387057387, + "loss": 0.7026, + "step": 1594 + }, + { + "epoch": 1.7520252643141563, + "grad_norm": 0.5686498284339905, + "learning_rate": 0.00013858363858363856, + "loss": 0.4916, + "step": 1595 + }, + { + "epoch": 1.7531237127557326, + "grad_norm": 0.561611533164978, + "learning_rate": 0.00013846153846153845, + "loss": 0.772, + "step": 1596 + }, + { + "epoch": 1.7542221611973088, + "grad_norm": 0.6220077872276306, + "learning_rate": 0.00013833943833943833, + "loss": 0.5694, + "step": 1597 + }, + { + "epoch": 1.755320609638885, + "grad_norm": 0.6902570724487305, + "learning_rate": 0.0001382173382173382, + "loss": 0.7963, + "step": 1598 + }, + { + "epoch": 1.7564190580804615, + "grad_norm": 2.0417702198028564, + "learning_rate": 0.00013809523809523808, + "loss": 0.6721, + "step": 1599 + }, + { + "epoch": 1.7575175065220376, + "grad_norm": 0.36764901876449585, + "learning_rate": 0.00013797313797313796, + "loss": 0.5714, + "step": 1600 + }, + { + "epoch": 1.7586159549636138, + "grad_norm": 0.6679022908210754, + "learning_rate": 0.00013785103785103785, + "loss": 0.7025, + "step": 1601 + }, + { + "epoch": 1.7597144034051901, + "grad_norm": 0.5749796628952026, + "learning_rate": 0.0001377289377289377, + "loss": 0.7381, + "step": 1602 + }, + { + "epoch": 1.7608128518467665, + "grad_norm": 0.9285687208175659, + "learning_rate": 0.0001376068376068376, + "loss": 0.6, + "step": 1603 + }, + { + "epoch": 1.7619113002883426, + "grad_norm": 0.8209772706031799, + "learning_rate": 0.00013748473748473748, + "loss": 0.5701, + "step": 1604 + }, + { + "epoch": 1.763009748729919, + "grad_norm": 0.7823337912559509, + "learning_rate": 0.00013736263736263734, + "loss": 0.6695, + "step": 1605 + }, + { + "epoch": 1.7641081971714954, + "grad_norm": 0.4885605275630951, + "learning_rate": 0.00013724053724053725, + "loss": 0.6487, + "step": 1606 + }, + { + "epoch": 1.7652066456130715, + "grad_norm": 0.36517488956451416, + "learning_rate": 0.0001371184371184371, + "loss": 0.5798, + "step": 1607 + }, + { + "epoch": 1.7663050940546479, + "grad_norm": 0.49961966276168823, + "learning_rate": 0.000136996336996337, + "loss": 0.4373, + "step": 1608 + }, + { + "epoch": 1.7674035424962242, + "grad_norm": 0.495263010263443, + "learning_rate": 0.00013687423687423688, + "loss": 0.5868, + "step": 1609 + }, + { + "epoch": 1.7685019909378004, + "grad_norm": 0.7384648323059082, + "learning_rate": 0.00013675213675213674, + "loss": 0.4957, + "step": 1610 + }, + { + "epoch": 1.7696004393793765, + "grad_norm": 0.465440034866333, + "learning_rate": 0.00013663003663003662, + "loss": 0.7424, + "step": 1611 + }, + { + "epoch": 1.7706988878209529, + "grad_norm": 0.68381667137146, + "learning_rate": 0.00013650793650793648, + "loss": 1.0421, + "step": 1612 + }, + { + "epoch": 1.7717973362625292, + "grad_norm": 4.455906867980957, + "learning_rate": 0.00013638583638583637, + "loss": 0.6626, + "step": 1613 + }, + { + "epoch": 1.7728957847041054, + "grad_norm": 0.6165801286697388, + "learning_rate": 0.00013626373626373625, + "loss": 0.6072, + "step": 1614 + }, + { + "epoch": 1.7739942331456817, + "grad_norm": 0.8296604156494141, + "learning_rate": 0.00013614163614163614, + "loss": 0.6507, + "step": 1615 + }, + { + "epoch": 1.775092681587258, + "grad_norm": 0.4678190350532532, + "learning_rate": 0.00013601953601953602, + "loss": 0.8466, + "step": 1616 + }, + { + "epoch": 1.7761911300288342, + "grad_norm": 1.2141482830047607, + "learning_rate": 0.00013589743589743588, + "loss": 0.513, + "step": 1617 + }, + { + "epoch": 1.7772895784704106, + "grad_norm": 0.4522024691104889, + "learning_rate": 0.00013577533577533577, + "loss": 0.7571, + "step": 1618 + }, + { + "epoch": 1.778388026911987, + "grad_norm": 2.0903220176696777, + "learning_rate": 0.00013565323565323565, + "loss": 0.7359, + "step": 1619 + }, + { + "epoch": 1.779486475353563, + "grad_norm": 0.5292307734489441, + "learning_rate": 0.0001355311355311355, + "loss": 0.6526, + "step": 1620 + }, + { + "epoch": 1.7805849237951392, + "grad_norm": 0.5047786235809326, + "learning_rate": 0.0001354090354090354, + "loss": 0.7056, + "step": 1621 + }, + { + "epoch": 1.7816833722367158, + "grad_norm": 0.4102507531642914, + "learning_rate": 0.00013528693528693528, + "loss": 0.8673, + "step": 1622 + }, + { + "epoch": 1.782781820678292, + "grad_norm": 0.471556693315506, + "learning_rate": 0.00013516483516483517, + "loss": 0.9424, + "step": 1623 + }, + { + "epoch": 1.783880269119868, + "grad_norm": 0.6595687866210938, + "learning_rate": 0.00013504273504273502, + "loss": 0.661, + "step": 1624 + }, + { + "epoch": 1.7849787175614444, + "grad_norm": 0.6221860647201538, + "learning_rate": 0.0001349206349206349, + "loss": 0.5457, + "step": 1625 + }, + { + "epoch": 1.7860771660030208, + "grad_norm": 0.9256211519241333, + "learning_rate": 0.0001347985347985348, + "loss": 0.9216, + "step": 1626 + }, + { + "epoch": 1.787175614444597, + "grad_norm": 0.31376492977142334, + "learning_rate": 0.00013467643467643465, + "loss": 0.7071, + "step": 1627 + }, + { + "epoch": 1.7882740628861733, + "grad_norm": 0.5313776135444641, + "learning_rate": 0.00013455433455433454, + "loss": 0.8111, + "step": 1628 + }, + { + "epoch": 1.7893725113277497, + "grad_norm": 0.8203330636024475, + "learning_rate": 0.00013443223443223442, + "loss": 0.5301, + "step": 1629 + }, + { + "epoch": 1.7904709597693258, + "grad_norm": 0.42774948477745056, + "learning_rate": 0.0001343101343101343, + "loss": 0.8359, + "step": 1630 + }, + { + "epoch": 1.791569408210902, + "grad_norm": 0.8165685534477234, + "learning_rate": 0.00013418803418803417, + "loss": 0.4894, + "step": 1631 + }, + { + "epoch": 1.7926678566524785, + "grad_norm": 0.5739139318466187, + "learning_rate": 0.00013406593406593405, + "loss": 0.7009, + "step": 1632 + }, + { + "epoch": 1.7937663050940547, + "grad_norm": 0.5102986097335815, + "learning_rate": 0.00013394383394383394, + "loss": 0.7174, + "step": 1633 + }, + { + "epoch": 1.7948647535356308, + "grad_norm": 1.1377652883529663, + "learning_rate": 0.0001338217338217338, + "loss": 0.79, + "step": 1634 + }, + { + "epoch": 1.7959632019772072, + "grad_norm": 0.44272491335868835, + "learning_rate": 0.00013369963369963368, + "loss": 0.6761, + "step": 1635 + }, + { + "epoch": 1.7970616504187835, + "grad_norm": 0.5084714889526367, + "learning_rate": 0.00013357753357753357, + "loss": 0.6848, + "step": 1636 + }, + { + "epoch": 1.7981600988603597, + "grad_norm": 0.752017080783844, + "learning_rate": 0.00013345543345543345, + "loss": 0.6107, + "step": 1637 + }, + { + "epoch": 1.799258547301936, + "grad_norm": 0.4430617690086365, + "learning_rate": 0.0001333333333333333, + "loss": 0.7639, + "step": 1638 + }, + { + "epoch": 1.8003569957435124, + "grad_norm": 0.8098049759864807, + "learning_rate": 0.0001332112332112332, + "loss": 0.8172, + "step": 1639 + }, + { + "epoch": 1.8014554441850885, + "grad_norm": 0.6817697286605835, + "learning_rate": 0.00013308913308913308, + "loss": 0.8274, + "step": 1640 + }, + { + "epoch": 1.8025538926266649, + "grad_norm": 0.5132669806480408, + "learning_rate": 0.00013296703296703294, + "loss": 0.6269, + "step": 1641 + }, + { + "epoch": 1.8036523410682412, + "grad_norm": 0.8487284183502197, + "learning_rate": 0.00013284493284493283, + "loss": 0.6734, + "step": 1642 + }, + { + "epoch": 1.8047507895098174, + "grad_norm": 0.7084116339683533, + "learning_rate": 0.0001327228327228327, + "loss": 0.703, + "step": 1643 + }, + { + "epoch": 1.8058492379513935, + "grad_norm": 0.39045432209968567, + "learning_rate": 0.00013260073260073257, + "loss": 0.5466, + "step": 1644 + }, + { + "epoch": 1.8069476863929699, + "grad_norm": 0.4408475160598755, + "learning_rate": 0.00013247863247863248, + "loss": 0.4998, + "step": 1645 + }, + { + "epoch": 1.8080461348345462, + "grad_norm": 0.41640380024909973, + "learning_rate": 0.00013235653235653234, + "loss": 0.49, + "step": 1646 + }, + { + "epoch": 1.8091445832761224, + "grad_norm": 0.6760729551315308, + "learning_rate": 0.00013223443223443223, + "loss": 0.4537, + "step": 1647 + }, + { + "epoch": 1.8102430317176987, + "grad_norm": 0.42953255772590637, + "learning_rate": 0.0001321123321123321, + "loss": 0.489, + "step": 1648 + }, + { + "epoch": 1.811341480159275, + "grad_norm": 0.3260825574398041, + "learning_rate": 0.00013199023199023197, + "loss": 0.6633, + "step": 1649 + }, + { + "epoch": 1.8124399286008512, + "grad_norm": 0.7073171138763428, + "learning_rate": 0.00013186813186813186, + "loss": 0.4953, + "step": 1650 + }, + { + "epoch": 1.8135383770424276, + "grad_norm": 0.36153069138526917, + "learning_rate": 0.00013174603174603172, + "loss": 0.7641, + "step": 1651 + }, + { + "epoch": 1.814636825484004, + "grad_norm": 0.4233636260032654, + "learning_rate": 0.00013162393162393163, + "loss": 0.7119, + "step": 1652 + }, + { + "epoch": 1.81573527392558, + "grad_norm": 0.5262153148651123, + "learning_rate": 0.0001315018315018315, + "loss": 0.4516, + "step": 1653 + }, + { + "epoch": 1.8168337223671562, + "grad_norm": 0.5263295769691467, + "learning_rate": 0.00013137973137973137, + "loss": 0.7786, + "step": 1654 + }, + { + "epoch": 1.8179321708087328, + "grad_norm": 0.3681116998195648, + "learning_rate": 0.00013125763125763126, + "loss": 0.5295, + "step": 1655 + }, + { + "epoch": 1.819030619250309, + "grad_norm": 0.5075433254241943, + "learning_rate": 0.00013113553113553112, + "loss": 0.6017, + "step": 1656 + }, + { + "epoch": 1.820129067691885, + "grad_norm": 0.2960616946220398, + "learning_rate": 0.000131013431013431, + "loss": 0.4951, + "step": 1657 + }, + { + "epoch": 1.8212275161334615, + "grad_norm": 0.4010205864906311, + "learning_rate": 0.0001308913308913309, + "loss": 0.8916, + "step": 1658 + }, + { + "epoch": 1.8223259645750378, + "grad_norm": 0.9112391471862793, + "learning_rate": 0.00013076923076923077, + "loss": 0.4978, + "step": 1659 + }, + { + "epoch": 1.823424413016614, + "grad_norm": 0.7214633226394653, + "learning_rate": 0.00013064713064713063, + "loss": 0.791, + "step": 1660 + }, + { + "epoch": 1.8245228614581903, + "grad_norm": 0.4174933433532715, + "learning_rate": 0.00013052503052503052, + "loss": 0.4099, + "step": 1661 + }, + { + "epoch": 1.8256213098997667, + "grad_norm": 0.4622137248516083, + "learning_rate": 0.0001304029304029304, + "loss": 1.1726, + "step": 1662 + }, + { + "epoch": 1.8267197583413428, + "grad_norm": 0.5991957783699036, + "learning_rate": 0.00013028083028083026, + "loss": 0.6713, + "step": 1663 + }, + { + "epoch": 1.8278182067829192, + "grad_norm": 0.43959730863571167, + "learning_rate": 0.00013015873015873015, + "loss": 0.5676, + "step": 1664 + }, + { + "epoch": 1.8289166552244955, + "grad_norm": 0.6271671056747437, + "learning_rate": 0.00013003663003663003, + "loss": 0.7399, + "step": 1665 + }, + { + "epoch": 1.8300151036660717, + "grad_norm": 0.6412084102630615, + "learning_rate": 0.0001299145299145299, + "loss": 0.7585, + "step": 1666 + }, + { + "epoch": 1.8311135521076478, + "grad_norm": 0.4066605269908905, + "learning_rate": 0.00012979242979242977, + "loss": 0.5756, + "step": 1667 + }, + { + "epoch": 1.8322120005492242, + "grad_norm": 0.3568172752857208, + "learning_rate": 0.00012967032967032966, + "loss": 0.968, + "step": 1668 + }, + { + "epoch": 1.8333104489908005, + "grad_norm": 0.5061100721359253, + "learning_rate": 0.00012954822954822955, + "loss": 0.5089, + "step": 1669 + }, + { + "epoch": 1.8344088974323767, + "grad_norm": 3.013622522354126, + "learning_rate": 0.0001294261294261294, + "loss": 0.5101, + "step": 1670 + }, + { + "epoch": 1.835507345873953, + "grad_norm": 0.40078219771385193, + "learning_rate": 0.0001293040293040293, + "loss": 0.5602, + "step": 1671 + }, + { + "epoch": 1.8366057943155294, + "grad_norm": 0.4108009338378906, + "learning_rate": 0.00012918192918192918, + "loss": 0.6338, + "step": 1672 + }, + { + "epoch": 1.8377042427571055, + "grad_norm": 0.5452212691307068, + "learning_rate": 0.00012905982905982903, + "loss": 0.5358, + "step": 1673 + }, + { + "epoch": 1.838802691198682, + "grad_norm": 0.4694603979587555, + "learning_rate": 0.00012893772893772895, + "loss": 0.7031, + "step": 1674 + }, + { + "epoch": 1.8399011396402583, + "grad_norm": 0.3787671625614166, + "learning_rate": 0.0001288156288156288, + "loss": 0.5667, + "step": 1675 + }, + { + "epoch": 1.8409995880818344, + "grad_norm": 0.4842737317085266, + "learning_rate": 0.0001286935286935287, + "loss": 0.5082, + "step": 1676 + }, + { + "epoch": 1.8420980365234105, + "grad_norm": 0.7690992951393127, + "learning_rate": 0.00012857142857142855, + "loss": 0.706, + "step": 1677 + }, + { + "epoch": 1.8431964849649871, + "grad_norm": 1.0891668796539307, + "learning_rate": 0.00012844932844932843, + "loss": 0.7162, + "step": 1678 + }, + { + "epoch": 1.8442949334065633, + "grad_norm": 0.4118032157421112, + "learning_rate": 0.00012832722832722832, + "loss": 0.7019, + "step": 1679 + }, + { + "epoch": 1.8453933818481394, + "grad_norm": 0.513157308101654, + "learning_rate": 0.00012820512820512818, + "loss": 0.4359, + "step": 1680 + }, + { + "epoch": 1.8464918302897158, + "grad_norm": 1.3229504823684692, + "learning_rate": 0.0001280830280830281, + "loss": 0.5555, + "step": 1681 + }, + { + "epoch": 1.8475902787312921, + "grad_norm": 0.6301699876785278, + "learning_rate": 0.00012796092796092795, + "loss": 0.5211, + "step": 1682 + }, + { + "epoch": 1.8486887271728683, + "grad_norm": 0.6125632524490356, + "learning_rate": 0.00012783882783882783, + "loss": 0.6287, + "step": 1683 + }, + { + "epoch": 1.8497871756144446, + "grad_norm": 1.806593418121338, + "learning_rate": 0.00012771672771672772, + "loss": 0.5794, + "step": 1684 + }, + { + "epoch": 1.850885624056021, + "grad_norm": 1.2972358465194702, + "learning_rate": 0.00012759462759462758, + "loss": 0.9205, + "step": 1685 + }, + { + "epoch": 1.8519840724975971, + "grad_norm": 1.0519033670425415, + "learning_rate": 0.00012747252747252746, + "loss": 0.7103, + "step": 1686 + }, + { + "epoch": 1.8530825209391735, + "grad_norm": 1.6489734649658203, + "learning_rate": 0.00012735042735042735, + "loss": 0.7585, + "step": 1687 + }, + { + "epoch": 1.8541809693807498, + "grad_norm": 0.7229527235031128, + "learning_rate": 0.0001272283272283272, + "loss": 0.8109, + "step": 1688 + }, + { + "epoch": 1.855279417822326, + "grad_norm": 0.35257261991500854, + "learning_rate": 0.0001271062271062271, + "loss": 0.8014, + "step": 1689 + }, + { + "epoch": 1.856377866263902, + "grad_norm": 0.4653327167034149, + "learning_rate": 0.00012698412698412698, + "loss": 0.6404, + "step": 1690 + }, + { + "epoch": 1.8574763147054785, + "grad_norm": 0.5230842232704163, + "learning_rate": 0.00012686202686202686, + "loss": 0.7413, + "step": 1691 + }, + { + "epoch": 1.8585747631470548, + "grad_norm": 0.42130210995674133, + "learning_rate": 0.00012673992673992672, + "loss": 0.7283, + "step": 1692 + }, + { + "epoch": 1.859673211588631, + "grad_norm": 1.4667960405349731, + "learning_rate": 0.0001266178266178266, + "loss": 0.5656, + "step": 1693 + }, + { + "epoch": 1.8607716600302073, + "grad_norm": 0.4077359139919281, + "learning_rate": 0.0001264957264957265, + "loss": 0.5891, + "step": 1694 + }, + { + "epoch": 1.8618701084717837, + "grad_norm": 0.503654956817627, + "learning_rate": 0.00012637362637362635, + "loss": 0.5912, + "step": 1695 + }, + { + "epoch": 1.8629685569133598, + "grad_norm": 1.6315315961837769, + "learning_rate": 0.00012625152625152624, + "loss": 0.5588, + "step": 1696 + }, + { + "epoch": 1.8640670053549362, + "grad_norm": 0.783920407295227, + "learning_rate": 0.00012612942612942612, + "loss": 0.6585, + "step": 1697 + }, + { + "epoch": 1.8651654537965126, + "grad_norm": 0.7186728715896606, + "learning_rate": 0.000126007326007326, + "loss": 0.9174, + "step": 1698 + }, + { + "epoch": 1.8662639022380887, + "grad_norm": 0.8784156441688538, + "learning_rate": 0.00012588522588522587, + "loss": 0.5835, + "step": 1699 + }, + { + "epoch": 1.8673623506796648, + "grad_norm": 0.7090787887573242, + "learning_rate": 0.00012576312576312575, + "loss": 0.7555, + "step": 1700 + }, + { + "epoch": 1.8684607991212414, + "grad_norm": 0.5508129596710205, + "learning_rate": 0.00012564102564102564, + "loss": 0.6168, + "step": 1701 + }, + { + "epoch": 1.8695592475628175, + "grad_norm": 0.40403681993484497, + "learning_rate": 0.0001255189255189255, + "loss": 0.4528, + "step": 1702 + }, + { + "epoch": 1.8706576960043937, + "grad_norm": 0.9553635716438293, + "learning_rate": 0.00012539682539682538, + "loss": 0.654, + "step": 1703 + }, + { + "epoch": 1.87175614444597, + "grad_norm": 1.0610092878341675, + "learning_rate": 0.00012527472527472527, + "loss": 0.6115, + "step": 1704 + }, + { + "epoch": 1.8728545928875464, + "grad_norm": 0.32898634672164917, + "learning_rate": 0.00012515262515262515, + "loss": 0.5651, + "step": 1705 + }, + { + "epoch": 1.8739530413291225, + "grad_norm": 0.4018780589103699, + "learning_rate": 0.000125030525030525, + "loss": 0.5919, + "step": 1706 + }, + { + "epoch": 1.875051489770699, + "grad_norm": 1.6521873474121094, + "learning_rate": 0.0001249084249084249, + "loss": 0.7137, + "step": 1707 + }, + { + "epoch": 1.8761499382122753, + "grad_norm": 0.5515930652618408, + "learning_rate": 0.00012478632478632478, + "loss": 0.4471, + "step": 1708 + }, + { + "epoch": 1.8772483866538514, + "grad_norm": 0.4156915545463562, + "learning_rate": 0.00012466422466422464, + "loss": 0.6575, + "step": 1709 + }, + { + "epoch": 1.8783468350954275, + "grad_norm": 0.41263312101364136, + "learning_rate": 0.00012454212454212453, + "loss": 0.542, + "step": 1710 + }, + { + "epoch": 1.8794452835370041, + "grad_norm": 1.0169517993927002, + "learning_rate": 0.0001244200244200244, + "loss": 1.1631, + "step": 1711 + }, + { + "epoch": 1.8805437319785803, + "grad_norm": 0.49169981479644775, + "learning_rate": 0.0001242979242979243, + "loss": 0.6707, + "step": 1712 + }, + { + "epoch": 1.8816421804201564, + "grad_norm": 0.44801297783851624, + "learning_rate": 0.00012417582417582416, + "loss": 1.0036, + "step": 1713 + }, + { + "epoch": 1.8827406288617328, + "grad_norm": 0.47181040048599243, + "learning_rate": 0.00012405372405372404, + "loss": 0.6693, + "step": 1714 + }, + { + "epoch": 1.8838390773033091, + "grad_norm": 0.39900457859039307, + "learning_rate": 0.00012393162393162393, + "loss": 0.6421, + "step": 1715 + }, + { + "epoch": 1.8849375257448853, + "grad_norm": 1.1160179376602173, + "learning_rate": 0.00012380952380952378, + "loss": 0.6599, + "step": 1716 + }, + { + "epoch": 1.8860359741864616, + "grad_norm": 0.6951555609703064, + "learning_rate": 0.00012368742368742367, + "loss": 0.743, + "step": 1717 + }, + { + "epoch": 1.887134422628038, + "grad_norm": 0.5381472706794739, + "learning_rate": 0.00012356532356532356, + "loss": 0.5051, + "step": 1718 + }, + { + "epoch": 1.8882328710696141, + "grad_norm": 0.48717793822288513, + "learning_rate": 0.00012344322344322341, + "loss": 0.7015, + "step": 1719 + }, + { + "epoch": 1.8893313195111905, + "grad_norm": 0.3720596432685852, + "learning_rate": 0.00012332112332112333, + "loss": 0.6743, + "step": 1720 + }, + { + "epoch": 1.8904297679527668, + "grad_norm": 1.1850451231002808, + "learning_rate": 0.00012319902319902318, + "loss": 0.6132, + "step": 1721 + }, + { + "epoch": 1.891528216394343, + "grad_norm": 0.4546525180339813, + "learning_rate": 0.00012307692307692307, + "loss": 0.5465, + "step": 1722 + }, + { + "epoch": 1.8926266648359191, + "grad_norm": 0.41415080428123474, + "learning_rate": 0.00012295482295482296, + "loss": 0.7259, + "step": 1723 + }, + { + "epoch": 1.8937251132774955, + "grad_norm": 0.44278842210769653, + "learning_rate": 0.00012283272283272281, + "loss": 0.7244, + "step": 1724 + }, + { + "epoch": 1.8948235617190718, + "grad_norm": 0.3887364864349365, + "learning_rate": 0.0001227106227106227, + "loss": 0.7124, + "step": 1725 + }, + { + "epoch": 1.895922010160648, + "grad_norm": 0.5405781269073486, + "learning_rate": 0.00012258852258852256, + "loss": 0.5153, + "step": 1726 + }, + { + "epoch": 1.8970204586022243, + "grad_norm": 0.3530559837818146, + "learning_rate": 0.00012246642246642247, + "loss": 0.5429, + "step": 1727 + }, + { + "epoch": 1.8981189070438007, + "grad_norm": 0.523621678352356, + "learning_rate": 0.00012234432234432233, + "loss": 0.5645, + "step": 1728 + }, + { + "epoch": 1.8992173554853768, + "grad_norm": 0.3893704116344452, + "learning_rate": 0.00012222222222222221, + "loss": 0.6419, + "step": 1729 + }, + { + "epoch": 1.9003158039269532, + "grad_norm": 0.7010704278945923, + "learning_rate": 0.0001221001221001221, + "loss": 0.5202, + "step": 1730 + }, + { + "epoch": 1.9014142523685296, + "grad_norm": 0.45551490783691406, + "learning_rate": 0.00012197802197802197, + "loss": 0.8492, + "step": 1731 + }, + { + "epoch": 1.9025127008101057, + "grad_norm": 1.0112484693527222, + "learning_rate": 0.00012185592185592184, + "loss": 0.8602, + "step": 1732 + }, + { + "epoch": 1.9036111492516818, + "grad_norm": 0.4509601294994354, + "learning_rate": 0.00012173382173382173, + "loss": 0.6138, + "step": 1733 + }, + { + "epoch": 1.9047095976932584, + "grad_norm": 0.4303388297557831, + "learning_rate": 0.0001216117216117216, + "loss": 0.4748, + "step": 1734 + }, + { + "epoch": 1.9058080461348346, + "grad_norm": 0.4452000558376312, + "learning_rate": 0.00012148962148962147, + "loss": 0.5869, + "step": 1735 + }, + { + "epoch": 1.9069064945764107, + "grad_norm": 0.5915077924728394, + "learning_rate": 0.00012136752136752136, + "loss": 0.8057, + "step": 1736 + }, + { + "epoch": 1.908004943017987, + "grad_norm": 0.38761547207832336, + "learning_rate": 0.00012124542124542123, + "loss": 0.5772, + "step": 1737 + }, + { + "epoch": 1.9091033914595634, + "grad_norm": 0.517752468585968, + "learning_rate": 0.00012112332112332112, + "loss": 0.7865, + "step": 1738 + }, + { + "epoch": 1.9102018399011396, + "grad_norm": 0.5325546860694885, + "learning_rate": 0.00012100122100122099, + "loss": 0.5934, + "step": 1739 + }, + { + "epoch": 1.911300288342716, + "grad_norm": 0.3930620551109314, + "learning_rate": 0.00012087912087912087, + "loss": 0.5974, + "step": 1740 + }, + { + "epoch": 1.9123987367842923, + "grad_norm": 1.1001818180084229, + "learning_rate": 0.00012075702075702075, + "loss": 0.6524, + "step": 1741 + }, + { + "epoch": 1.9134971852258684, + "grad_norm": 0.3690165877342224, + "learning_rate": 0.00012063492063492062, + "loss": 0.36, + "step": 1742 + }, + { + "epoch": 1.9145956336674448, + "grad_norm": 0.4403206408023834, + "learning_rate": 0.0001205128205128205, + "loss": 0.5737, + "step": 1743 + }, + { + "epoch": 1.9156940821090211, + "grad_norm": 0.651498019695282, + "learning_rate": 0.00012039072039072037, + "loss": 0.657, + "step": 1744 + }, + { + "epoch": 1.9167925305505973, + "grad_norm": 0.6880660057067871, + "learning_rate": 0.00012026862026862025, + "loss": 0.6891, + "step": 1745 + }, + { + "epoch": 1.9178909789921734, + "grad_norm": 0.4968664348125458, + "learning_rate": 0.00012014652014652015, + "loss": 0.841, + "step": 1746 + }, + { + "epoch": 1.9189894274337498, + "grad_norm": 0.4392407536506653, + "learning_rate": 0.00012002442002442002, + "loss": 0.7096, + "step": 1747 + }, + { + "epoch": 1.9200878758753261, + "grad_norm": 0.41028741002082825, + "learning_rate": 0.00011990231990231989, + "loss": 0.5838, + "step": 1748 + }, + { + "epoch": 1.9211863243169023, + "grad_norm": 0.7928158640861511, + "learning_rate": 0.00011978021978021978, + "loss": 0.6633, + "step": 1749 + }, + { + "epoch": 1.9222847727584786, + "grad_norm": 0.4970681071281433, + "learning_rate": 0.00011965811965811965, + "loss": 0.7764, + "step": 1750 + }, + { + "epoch": 1.923383221200055, + "grad_norm": 0.49581378698349, + "learning_rate": 0.00011953601953601952, + "loss": 0.7204, + "step": 1751 + }, + { + "epoch": 1.9244816696416311, + "grad_norm": 1.309241771697998, + "learning_rate": 0.00011941391941391939, + "loss": 0.5859, + "step": 1752 + }, + { + "epoch": 1.9255801180832075, + "grad_norm": 0.4651016592979431, + "learning_rate": 0.00011929181929181929, + "loss": 0.6425, + "step": 1753 + }, + { + "epoch": 1.9266785665247839, + "grad_norm": 0.5377634167671204, + "learning_rate": 0.00011916971916971916, + "loss": 0.8244, + "step": 1754 + }, + { + "epoch": 1.92777701496636, + "grad_norm": 0.6809287667274475, + "learning_rate": 0.00011904761904761903, + "loss": 0.5711, + "step": 1755 + }, + { + "epoch": 1.9288754634079361, + "grad_norm": 0.650701105594635, + "learning_rate": 0.00011892551892551892, + "loss": 0.8341, + "step": 1756 + }, + { + "epoch": 1.9299739118495127, + "grad_norm": 1.1710751056671143, + "learning_rate": 0.00011880341880341879, + "loss": 0.8093, + "step": 1757 + }, + { + "epoch": 1.9310723602910889, + "grad_norm": 0.4244484603404999, + "learning_rate": 0.00011868131868131866, + "loss": 0.5556, + "step": 1758 + }, + { + "epoch": 1.932170808732665, + "grad_norm": 0.43999040126800537, + "learning_rate": 0.00011855921855921855, + "loss": 0.4582, + "step": 1759 + }, + { + "epoch": 1.9332692571742414, + "grad_norm": 0.4197145700454712, + "learning_rate": 0.00011843711843711843, + "loss": 0.6475, + "step": 1760 + }, + { + "epoch": 1.9343677056158177, + "grad_norm": 0.36619749665260315, + "learning_rate": 0.0001183150183150183, + "loss": 0.5804, + "step": 1761 + }, + { + "epoch": 1.9354661540573939, + "grad_norm": 1.7230706214904785, + "learning_rate": 0.00011819291819291819, + "loss": 0.7064, + "step": 1762 + }, + { + "epoch": 1.9365646024989702, + "grad_norm": 0.7621874213218689, + "learning_rate": 0.00011807081807081806, + "loss": 0.6766, + "step": 1763 + }, + { + "epoch": 1.9376630509405466, + "grad_norm": 0.5920525789260864, + "learning_rate": 0.00011794871794871794, + "loss": 0.7092, + "step": 1764 + }, + { + "epoch": 1.9387614993821227, + "grad_norm": 1.5368432998657227, + "learning_rate": 0.00011782661782661781, + "loss": 0.3366, + "step": 1765 + }, + { + "epoch": 1.9398599478236989, + "grad_norm": 0.43197643756866455, + "learning_rate": 0.00011770451770451769, + "loss": 0.6158, + "step": 1766 + }, + { + "epoch": 1.9409583962652754, + "grad_norm": 0.4623143970966339, + "learning_rate": 0.00011758241758241756, + "loss": 0.6574, + "step": 1767 + }, + { + "epoch": 1.9420568447068516, + "grad_norm": 0.40638601779937744, + "learning_rate": 0.00011746031746031744, + "loss": 0.4385, + "step": 1768 + }, + { + "epoch": 1.9431552931484277, + "grad_norm": 0.5941652655601501, + "learning_rate": 0.00011733821733821734, + "loss": 0.8634, + "step": 1769 + }, + { + "epoch": 1.944253741590004, + "grad_norm": 0.9646288156509399, + "learning_rate": 0.00011721611721611721, + "loss": 0.7107, + "step": 1770 + }, + { + "epoch": 1.9453521900315804, + "grad_norm": 1.6859776973724365, + "learning_rate": 0.00011709401709401708, + "loss": 0.5544, + "step": 1771 + }, + { + "epoch": 1.9464506384731566, + "grad_norm": 0.4034999907016754, + "learning_rate": 0.00011697191697191697, + "loss": 0.559, + "step": 1772 + }, + { + "epoch": 1.947549086914733, + "grad_norm": 0.3644643723964691, + "learning_rate": 0.00011684981684981684, + "loss": 0.535, + "step": 1773 + }, + { + "epoch": 1.9486475353563093, + "grad_norm": 0.5826202034950256, + "learning_rate": 0.00011672771672771671, + "loss": 0.6405, + "step": 1774 + }, + { + "epoch": 1.9497459837978854, + "grad_norm": 0.5501505136489868, + "learning_rate": 0.00011660561660561661, + "loss": 0.5702, + "step": 1775 + }, + { + "epoch": 1.9508444322394618, + "grad_norm": 0.7928853631019592, + "learning_rate": 0.00011648351648351648, + "loss": 0.666, + "step": 1776 + }, + { + "epoch": 1.9519428806810382, + "grad_norm": 0.8168489933013916, + "learning_rate": 0.00011636141636141635, + "loss": 0.4451, + "step": 1777 + }, + { + "epoch": 1.9530413291226143, + "grad_norm": 0.3752410113811493, + "learning_rate": 0.00011623931623931622, + "loss": 0.6552, + "step": 1778 + }, + { + "epoch": 1.9541397775641904, + "grad_norm": 0.9020218849182129, + "learning_rate": 0.00011611721611721611, + "loss": 0.5994, + "step": 1779 + }, + { + "epoch": 1.9552382260057668, + "grad_norm": 0.7668479084968567, + "learning_rate": 0.00011599511599511598, + "loss": 0.5007, + "step": 1780 + }, + { + "epoch": 1.9563366744473432, + "grad_norm": 0.5034022331237793, + "learning_rate": 0.00011587301587301585, + "loss": 0.5211, + "step": 1781 + }, + { + "epoch": 1.9574351228889193, + "grad_norm": 1.0153850317001343, + "learning_rate": 0.00011575091575091575, + "loss": 0.5953, + "step": 1782 + }, + { + "epoch": 1.9585335713304957, + "grad_norm": 0.40088045597076416, + "learning_rate": 0.00011562881562881562, + "loss": 0.568, + "step": 1783 + }, + { + "epoch": 1.959632019772072, + "grad_norm": 1.4017099142074585, + "learning_rate": 0.0001155067155067155, + "loss": 0.7058, + "step": 1784 + }, + { + "epoch": 1.9607304682136482, + "grad_norm": 0.6009597778320312, + "learning_rate": 0.00011538461538461538, + "loss": 0.6239, + "step": 1785 + }, + { + "epoch": 1.9618289166552245, + "grad_norm": 0.5155071020126343, + "learning_rate": 0.00011526251526251525, + "loss": 0.6089, + "step": 1786 + }, + { + "epoch": 1.9629273650968009, + "grad_norm": 0.4248057007789612, + "learning_rate": 0.00011514041514041513, + "loss": 0.6481, + "step": 1787 + }, + { + "epoch": 1.964025813538377, + "grad_norm": 0.6521177887916565, + "learning_rate": 0.00011501831501831501, + "loss": 0.6598, + "step": 1788 + }, + { + "epoch": 1.9651242619799532, + "grad_norm": 0.44697993993759155, + "learning_rate": 0.00011489621489621488, + "loss": 0.8944, + "step": 1789 + }, + { + "epoch": 1.9662227104215297, + "grad_norm": 0.41537097096443176, + "learning_rate": 0.00011477411477411476, + "loss": 0.5304, + "step": 1790 + }, + { + "epoch": 1.9673211588631059, + "grad_norm": 0.48793885111808777, + "learning_rate": 0.00011465201465201464, + "loss": 0.7262, + "step": 1791 + }, + { + "epoch": 1.968419607304682, + "grad_norm": 0.8768893480300903, + "learning_rate": 0.00011452991452991453, + "loss": 0.6748, + "step": 1792 + }, + { + "epoch": 1.9695180557462584, + "grad_norm": 0.39224761724472046, + "learning_rate": 0.0001144078144078144, + "loss": 0.5503, + "step": 1793 + }, + { + "epoch": 1.9706165041878347, + "grad_norm": 0.5617446899414062, + "learning_rate": 0.00011428571428571427, + "loss": 0.7329, + "step": 1794 + }, + { + "epoch": 1.9717149526294109, + "grad_norm": 0.3787171542644501, + "learning_rate": 0.00011416361416361416, + "loss": 0.545, + "step": 1795 + }, + { + "epoch": 1.9728134010709872, + "grad_norm": 1.5167701244354248, + "learning_rate": 0.00011404151404151403, + "loss": 0.492, + "step": 1796 + }, + { + "epoch": 1.9739118495125636, + "grad_norm": 0.6436883807182312, + "learning_rate": 0.0001139194139194139, + "loss": 0.5644, + "step": 1797 + }, + { + "epoch": 1.9750102979541397, + "grad_norm": 0.7104658484458923, + "learning_rate": 0.0001137973137973138, + "loss": 0.7485, + "step": 1798 + }, + { + "epoch": 1.976108746395716, + "grad_norm": 0.7996894717216492, + "learning_rate": 0.00011367521367521367, + "loss": 0.6918, + "step": 1799 + }, + { + "epoch": 1.9772071948372925, + "grad_norm": 0.6419106721878052, + "learning_rate": 0.00011355311355311354, + "loss": 0.5945, + "step": 1800 + }, + { + "epoch": 1.9783056432788686, + "grad_norm": 0.5158131718635559, + "learning_rate": 0.00011343101343101343, + "loss": 0.6685, + "step": 1801 + }, + { + "epoch": 1.9794040917204447, + "grad_norm": 1.0825144052505493, + "learning_rate": 0.0001133089133089133, + "loss": 0.6774, + "step": 1802 + }, + { + "epoch": 1.980502540162021, + "grad_norm": 0.3999088704586029, + "learning_rate": 0.00011318681318681317, + "loss": 0.632, + "step": 1803 + }, + { + "epoch": 1.9816009886035975, + "grad_norm": 0.8866069316864014, + "learning_rate": 0.00011306471306471304, + "loss": 0.6541, + "step": 1804 + }, + { + "epoch": 1.9826994370451736, + "grad_norm": 0.3858928978443146, + "learning_rate": 0.00011294261294261294, + "loss": 0.6608, + "step": 1805 + }, + { + "epoch": 1.98379788548675, + "grad_norm": 0.513117790222168, + "learning_rate": 0.00011282051282051281, + "loss": 0.7598, + "step": 1806 + }, + { + "epoch": 1.9848963339283263, + "grad_norm": 0.3166581392288208, + "learning_rate": 0.00011269841269841269, + "loss": 0.781, + "step": 1807 + }, + { + "epoch": 1.9859947823699025, + "grad_norm": 0.3982362151145935, + "learning_rate": 0.00011257631257631257, + "loss": 0.873, + "step": 1808 + }, + { + "epoch": 1.9870932308114788, + "grad_norm": 0.3784008026123047, + "learning_rate": 0.00011245421245421244, + "loss": 0.7286, + "step": 1809 + }, + { + "epoch": 1.9881916792530552, + "grad_norm": 0.7578315138816833, + "learning_rate": 0.00011233211233211232, + "loss": 0.5958, + "step": 1810 + }, + { + "epoch": 1.9892901276946313, + "grad_norm": 0.8509061932563782, + "learning_rate": 0.0001122100122100122, + "loss": 0.557, + "step": 1811 + }, + { + "epoch": 1.9903885761362075, + "grad_norm": 0.5107323527336121, + "learning_rate": 0.00011208791208791207, + "loss": 0.6994, + "step": 1812 + }, + { + "epoch": 1.991487024577784, + "grad_norm": 0.5421388149261475, + "learning_rate": 0.00011196581196581196, + "loss": 0.8839, + "step": 1813 + }, + { + "epoch": 1.9925854730193602, + "grad_norm": 0.7442356944084167, + "learning_rate": 0.00011184371184371184, + "loss": 0.6676, + "step": 1814 + }, + { + "epoch": 1.9936839214609363, + "grad_norm": 0.34132111072540283, + "learning_rate": 0.00011172161172161172, + "loss": 0.5714, + "step": 1815 + }, + { + "epoch": 1.9947823699025127, + "grad_norm": 0.3995620906352997, + "learning_rate": 0.00011159951159951159, + "loss": 0.4811, + "step": 1816 + }, + { + "epoch": 1.995880818344089, + "grad_norm": 0.5613861083984375, + "learning_rate": 0.00011147741147741146, + "loss": 0.7495, + "step": 1817 + }, + { + "epoch": 1.9969792667856652, + "grad_norm": 0.4366309642791748, + "learning_rate": 0.00011135531135531135, + "loss": 0.6512, + "step": 1818 + }, + { + "epoch": 1.9980777152272415, + "grad_norm": 0.889916718006134, + "learning_rate": 0.00011123321123321122, + "loss": 0.5544, + "step": 1819 + }, + { + "epoch": 1.999176163668818, + "grad_norm": 0.512112021446228, + "learning_rate": 0.00011111111111111109, + "loss": 1.136, + "step": 1820 + }, + { + "epoch": 2.000274612110394, + "grad_norm": 0.5241844654083252, + "learning_rate": 0.00011098901098901099, + "loss": 0.5898, + "step": 1821 + }, + { + "epoch": 2.00137306055197, + "grad_norm": 0.38159477710723877, + "learning_rate": 0.00011086691086691086, + "loss": 0.5523, + "step": 1822 + }, + { + "epoch": 2.0024715089935468, + "grad_norm": 1.0415009260177612, + "learning_rate": 0.00011074481074481073, + "loss": 0.6963, + "step": 1823 + }, + { + "epoch": 2.003569957435123, + "grad_norm": 0.5349957942962646, + "learning_rate": 0.00011062271062271062, + "loss": 0.4422, + "step": 1824 + }, + { + "epoch": 2.004668405876699, + "grad_norm": 0.4512043297290802, + "learning_rate": 0.00011050061050061049, + "loss": 0.5467, + "step": 1825 + }, + { + "epoch": 2.0057668543182756, + "grad_norm": 0.8268045783042908, + "learning_rate": 0.00011037851037851036, + "loss": 0.6931, + "step": 1826 + }, + { + "epoch": 2.0068653027598518, + "grad_norm": 0.47922319173812866, + "learning_rate": 0.00011025641025641026, + "loss": 0.707, + "step": 1827 + }, + { + "epoch": 2.007963751201428, + "grad_norm": 1.352858304977417, + "learning_rate": 0.00011013431013431013, + "loss": 0.5658, + "step": 1828 + }, + { + "epoch": 2.0090621996430045, + "grad_norm": 0.6304643154144287, + "learning_rate": 0.00011001221001221, + "loss": 0.6526, + "step": 1829 + }, + { + "epoch": 2.0101606480845806, + "grad_norm": 0.3759060502052307, + "learning_rate": 0.00010989010989010988, + "loss": 0.627, + "step": 1830 + }, + { + "epoch": 2.0112590965261568, + "grad_norm": 0.5676531195640564, + "learning_rate": 0.00010976800976800976, + "loss": 0.7568, + "step": 1831 + }, + { + "epoch": 2.012357544967733, + "grad_norm": 0.7481321692466736, + "learning_rate": 0.00010964590964590963, + "loss": 0.7304, + "step": 1832 + }, + { + "epoch": 2.0134559934093095, + "grad_norm": 1.0350905656814575, + "learning_rate": 0.0001095238095238095, + "loss": 0.7414, + "step": 1833 + }, + { + "epoch": 2.0145544418508856, + "grad_norm": 0.7817292809486389, + "learning_rate": 0.00010940170940170939, + "loss": 0.7742, + "step": 1834 + }, + { + "epoch": 2.0156528902924618, + "grad_norm": 0.44659602642059326, + "learning_rate": 0.00010927960927960928, + "loss": 0.7872, + "step": 1835 + }, + { + "epoch": 2.0167513387340383, + "grad_norm": 0.46931198239326477, + "learning_rate": 0.00010915750915750915, + "loss": 0.5596, + "step": 1836 + }, + { + "epoch": 2.0178497871756145, + "grad_norm": 0.34634560346603394, + "learning_rate": 0.00010903540903540903, + "loss": 0.6861, + "step": 1837 + }, + { + "epoch": 2.0189482356171906, + "grad_norm": 0.36579200625419617, + "learning_rate": 0.0001089133089133089, + "loss": 0.6586, + "step": 1838 + }, + { + "epoch": 2.020046684058767, + "grad_norm": 0.9167144894599915, + "learning_rate": 0.00010879120879120878, + "loss": 0.7125, + "step": 1839 + }, + { + "epoch": 2.0211451325003433, + "grad_norm": 0.4107789993286133, + "learning_rate": 0.00010866910866910866, + "loss": 0.6089, + "step": 1840 + }, + { + "epoch": 2.0222435809419195, + "grad_norm": 1.0845204591751099, + "learning_rate": 0.00010854700854700854, + "loss": 0.499, + "step": 1841 + }, + { + "epoch": 2.0233420293834956, + "grad_norm": 0.382376492023468, + "learning_rate": 0.00010842490842490841, + "loss": 0.5505, + "step": 1842 + }, + { + "epoch": 2.024440477825072, + "grad_norm": 0.38339781761169434, + "learning_rate": 0.00010830280830280828, + "loss": 0.4593, + "step": 1843 + }, + { + "epoch": 2.0255389262666483, + "grad_norm": 0.45328769087791443, + "learning_rate": 0.00010818070818070818, + "loss": 0.8437, + "step": 1844 + }, + { + "epoch": 2.0266373747082245, + "grad_norm": 0.3051920533180237, + "learning_rate": 0.00010805860805860805, + "loss": 0.6096, + "step": 1845 + }, + { + "epoch": 2.027735823149801, + "grad_norm": 0.4249560236930847, + "learning_rate": 0.00010793650793650792, + "loss": 0.6441, + "step": 1846 + }, + { + "epoch": 2.028834271591377, + "grad_norm": 0.6639708280563354, + "learning_rate": 0.00010781440781440781, + "loss": 0.716, + "step": 1847 + }, + { + "epoch": 2.0299327200329533, + "grad_norm": 0.4324635863304138, + "learning_rate": 0.00010769230769230768, + "loss": 0.5288, + "step": 1848 + }, + { + "epoch": 2.03103116847453, + "grad_norm": 0.46487629413604736, + "learning_rate": 0.00010757020757020755, + "loss": 0.4908, + "step": 1849 + }, + { + "epoch": 2.032129616916106, + "grad_norm": 0.5104641318321228, + "learning_rate": 0.00010744810744810745, + "loss": 0.6367, + "step": 1850 + }, + { + "epoch": 2.033228065357682, + "grad_norm": 0.4010922312736511, + "learning_rate": 0.00010732600732600732, + "loss": 0.4266, + "step": 1851 + }, + { + "epoch": 2.0343265137992583, + "grad_norm": 0.6835510730743408, + "learning_rate": 0.0001072039072039072, + "loss": 1.0077, + "step": 1852 + }, + { + "epoch": 2.035424962240835, + "grad_norm": 0.7012602686882019, + "learning_rate": 0.00010708180708180708, + "loss": 0.7656, + "step": 1853 + }, + { + "epoch": 2.036523410682411, + "grad_norm": 0.8202001452445984, + "learning_rate": 0.00010695970695970695, + "loss": 0.9796, + "step": 1854 + }, + { + "epoch": 2.037621859123987, + "grad_norm": 0.37708353996276855, + "learning_rate": 0.00010683760683760682, + "loss": 0.3664, + "step": 1855 + }, + { + "epoch": 2.0387203075655638, + "grad_norm": 0.34818801283836365, + "learning_rate": 0.0001067155067155067, + "loss": 0.5365, + "step": 1856 + }, + { + "epoch": 2.03981875600714, + "grad_norm": 0.46427440643310547, + "learning_rate": 0.0001065934065934066, + "loss": 0.7503, + "step": 1857 + }, + { + "epoch": 2.040917204448716, + "grad_norm": 0.4782754182815552, + "learning_rate": 0.00010647130647130647, + "loss": 0.9247, + "step": 1858 + }, + { + "epoch": 2.0420156528902926, + "grad_norm": 0.6814667582511902, + "learning_rate": 0.00010634920634920634, + "loss": 0.5365, + "step": 1859 + }, + { + "epoch": 2.0431141013318688, + "grad_norm": 0.4782056510448456, + "learning_rate": 0.00010622710622710622, + "loss": 0.7444, + "step": 1860 + }, + { + "epoch": 2.044212549773445, + "grad_norm": 0.768439769744873, + "learning_rate": 0.0001061050061050061, + "loss": 0.6386, + "step": 1861 + }, + { + "epoch": 2.0453109982150215, + "grad_norm": 0.9991740584373474, + "learning_rate": 0.00010598290598290597, + "loss": 0.4762, + "step": 1862 + }, + { + "epoch": 2.0464094466565976, + "grad_norm": 0.4244922995567322, + "learning_rate": 0.00010586080586080585, + "loss": 0.4469, + "step": 1863 + }, + { + "epoch": 2.0475078950981738, + "grad_norm": 0.4085465371608734, + "learning_rate": 0.00010573870573870573, + "loss": 0.7215, + "step": 1864 + }, + { + "epoch": 2.04860634353975, + "grad_norm": 1.3068008422851562, + "learning_rate": 0.0001056166056166056, + "loss": 0.7781, + "step": 1865 + }, + { + "epoch": 2.0497047919813265, + "grad_norm": 0.3995974659919739, + "learning_rate": 0.0001054945054945055, + "loss": 0.6114, + "step": 1866 + }, + { + "epoch": 2.0508032404229026, + "grad_norm": 0.47944560647010803, + "learning_rate": 0.00010537240537240537, + "loss": 0.7355, + "step": 1867 + }, + { + "epoch": 2.0519016888644788, + "grad_norm": 1.6718720197677612, + "learning_rate": 0.00010525030525030524, + "loss": 0.5987, + "step": 1868 + }, + { + "epoch": 2.0530001373060554, + "grad_norm": 0.46015220880508423, + "learning_rate": 0.00010512820512820511, + "loss": 0.481, + "step": 1869 + }, + { + "epoch": 2.0540985857476315, + "grad_norm": 0.4863795042037964, + "learning_rate": 0.000105006105006105, + "loss": 0.5877, + "step": 1870 + }, + { + "epoch": 2.0551970341892076, + "grad_norm": 0.9190402030944824, + "learning_rate": 0.00010488400488400487, + "loss": 0.7941, + "step": 1871 + }, + { + "epoch": 2.056295482630784, + "grad_norm": 0.6056554317474365, + "learning_rate": 0.00010476190476190474, + "loss": 0.5455, + "step": 1872 + }, + { + "epoch": 2.0573939310723603, + "grad_norm": 0.7070736289024353, + "learning_rate": 0.00010463980463980464, + "loss": 0.6112, + "step": 1873 + }, + { + "epoch": 2.0584923795139365, + "grad_norm": 0.5415268540382385, + "learning_rate": 0.00010451770451770451, + "loss": 0.7141, + "step": 1874 + }, + { + "epoch": 2.0595908279555126, + "grad_norm": 0.45696091651916504, + "learning_rate": 0.00010439560439560438, + "loss": 0.7825, + "step": 1875 + }, + { + "epoch": 2.060689276397089, + "grad_norm": 0.5728979706764221, + "learning_rate": 0.00010427350427350427, + "loss": 0.5869, + "step": 1876 + }, + { + "epoch": 2.0617877248386653, + "grad_norm": 0.5910143852233887, + "learning_rate": 0.00010415140415140414, + "loss": 0.728, + "step": 1877 + }, + { + "epoch": 2.0628861732802415, + "grad_norm": 0.530915379524231, + "learning_rate": 0.00010402930402930401, + "loss": 0.6459, + "step": 1878 + }, + { + "epoch": 2.063984621721818, + "grad_norm": 0.36358964443206787, + "learning_rate": 0.00010390720390720391, + "loss": 0.7536, + "step": 1879 + }, + { + "epoch": 2.065083070163394, + "grad_norm": 2.7523410320281982, + "learning_rate": 0.00010378510378510379, + "loss": 0.6347, + "step": 1880 + }, + { + "epoch": 2.0661815186049703, + "grad_norm": 0.6842527389526367, + "learning_rate": 0.00010366300366300366, + "loss": 0.4943, + "step": 1881 + }, + { + "epoch": 2.067279967046547, + "grad_norm": 0.5830293297767639, + "learning_rate": 0.00010354090354090353, + "loss": 0.5855, + "step": 1882 + }, + { + "epoch": 2.068378415488123, + "grad_norm": 0.981920599937439, + "learning_rate": 0.00010341880341880341, + "loss": 0.4425, + "step": 1883 + }, + { + "epoch": 2.069476863929699, + "grad_norm": 2.0826029777526855, + "learning_rate": 0.00010329670329670329, + "loss": 0.5399, + "step": 1884 + }, + { + "epoch": 2.0705753123712753, + "grad_norm": 0.4648442268371582, + "learning_rate": 0.00010317460317460316, + "loss": 0.6203, + "step": 1885 + }, + { + "epoch": 2.071673760812852, + "grad_norm": 0.5086346864700317, + "learning_rate": 0.00010305250305250304, + "loss": 0.6091, + "step": 1886 + }, + { + "epoch": 2.072772209254428, + "grad_norm": 0.40404266119003296, + "learning_rate": 0.00010293040293040292, + "loss": 0.5013, + "step": 1887 + }, + { + "epoch": 2.073870657696004, + "grad_norm": 2.0507569313049316, + "learning_rate": 0.0001028083028083028, + "loss": 0.7822, + "step": 1888 + }, + { + "epoch": 2.074969106137581, + "grad_norm": 0.9318211078643799, + "learning_rate": 0.00010268620268620269, + "loss": 0.6638, + "step": 1889 + }, + { + "epoch": 2.076067554579157, + "grad_norm": 0.7601054310798645, + "learning_rate": 0.00010256410256410256, + "loss": 0.6085, + "step": 1890 + }, + { + "epoch": 2.077166003020733, + "grad_norm": 1.1299306154251099, + "learning_rate": 0.00010244200244200243, + "loss": 0.682, + "step": 1891 + }, + { + "epoch": 2.0782644514623096, + "grad_norm": 0.5009475350379944, + "learning_rate": 0.0001023199023199023, + "loss": 0.7229, + "step": 1892 + }, + { + "epoch": 2.079362899903886, + "grad_norm": 0.3432561159133911, + "learning_rate": 0.00010219780219780219, + "loss": 0.5991, + "step": 1893 + }, + { + "epoch": 2.080461348345462, + "grad_norm": 0.5224031805992126, + "learning_rate": 0.00010207570207570206, + "loss": 0.3687, + "step": 1894 + }, + { + "epoch": 2.0815597967870385, + "grad_norm": 0.4849548935890198, + "learning_rate": 0.00010195360195360193, + "loss": 0.507, + "step": 1895 + }, + { + "epoch": 2.0826582452286146, + "grad_norm": 0.6093185544013977, + "learning_rate": 0.00010183150183150183, + "loss": 0.7019, + "step": 1896 + }, + { + "epoch": 2.083756693670191, + "grad_norm": 0.7408457398414612, + "learning_rate": 0.0001017094017094017, + "loss": 0.6331, + "step": 1897 + }, + { + "epoch": 2.084855142111767, + "grad_norm": 0.67701655626297, + "learning_rate": 0.00010158730158730157, + "loss": 0.6685, + "step": 1898 + }, + { + "epoch": 2.0859535905533435, + "grad_norm": 0.2880030870437622, + "learning_rate": 0.00010146520146520146, + "loss": 0.4043, + "step": 1899 + }, + { + "epoch": 2.0870520389949196, + "grad_norm": 0.45890796184539795, + "learning_rate": 0.00010134310134310133, + "loss": 0.3695, + "step": 1900 + }, + { + "epoch": 2.088150487436496, + "grad_norm": 0.7898344397544861, + "learning_rate": 0.0001012210012210012, + "loss": 0.7875, + "step": 1901 + }, + { + "epoch": 2.0892489358780724, + "grad_norm": 0.5648753046989441, + "learning_rate": 0.0001010989010989011, + "loss": 0.6058, + "step": 1902 + }, + { + "epoch": 2.0903473843196485, + "grad_norm": 0.7880465984344482, + "learning_rate": 0.00010097680097680098, + "loss": 0.6403, + "step": 1903 + }, + { + "epoch": 2.0914458327612246, + "grad_norm": 0.4169737696647644, + "learning_rate": 0.00010085470085470085, + "loss": 0.71, + "step": 1904 + }, + { + "epoch": 2.0925442812028012, + "grad_norm": 0.33653560280799866, + "learning_rate": 0.00010073260073260072, + "loss": 0.6278, + "step": 1905 + }, + { + "epoch": 2.0936427296443774, + "grad_norm": 0.6861558556556702, + "learning_rate": 0.0001006105006105006, + "loss": 0.8463, + "step": 1906 + }, + { + "epoch": 2.0947411780859535, + "grad_norm": 0.29407018423080444, + "learning_rate": 0.00010048840048840048, + "loss": 0.5644, + "step": 1907 + }, + { + "epoch": 2.09583962652753, + "grad_norm": 0.673083484172821, + "learning_rate": 0.00010036630036630035, + "loss": 0.8353, + "step": 1908 + }, + { + "epoch": 2.0969380749691062, + "grad_norm": 0.429061621427536, + "learning_rate": 0.00010024420024420023, + "loss": 0.6381, + "step": 1909 + }, + { + "epoch": 2.0980365234106824, + "grad_norm": 0.5113368630409241, + "learning_rate": 0.00010012210012210012, + "loss": 0.7603, + "step": 1910 + }, + { + "epoch": 2.0991349718522585, + "grad_norm": 0.9005820751190186, + "learning_rate": 9.999999999999999e-05, + "loss": 0.6331, + "step": 1911 + }, + { + "epoch": 2.100233420293835, + "grad_norm": 0.489851176738739, + "learning_rate": 9.987789987789988e-05, + "loss": 0.8564, + "step": 1912 + }, + { + "epoch": 2.1013318687354112, + "grad_norm": 0.42647236585617065, + "learning_rate": 9.975579975579975e-05, + "loss": 0.5496, + "step": 1913 + }, + { + "epoch": 2.1024303171769874, + "grad_norm": 0.9061693549156189, + "learning_rate": 9.963369963369962e-05, + "loss": 0.4478, + "step": 1914 + }, + { + "epoch": 2.103528765618564, + "grad_norm": 0.4721933901309967, + "learning_rate": 9.95115995115995e-05, + "loss": 0.6066, + "step": 1915 + }, + { + "epoch": 2.10462721406014, + "grad_norm": 0.7265921831130981, + "learning_rate": 9.938949938949938e-05, + "loss": 0.7195, + "step": 1916 + }, + { + "epoch": 2.1057256625017162, + "grad_norm": 0.4521386921405792, + "learning_rate": 9.926739926739925e-05, + "loss": 0.6476, + "step": 1917 + }, + { + "epoch": 2.106824110943293, + "grad_norm": 0.42982912063598633, + "learning_rate": 9.914529914529912e-05, + "loss": 0.535, + "step": 1918 + }, + { + "epoch": 2.107922559384869, + "grad_norm": 0.4758259952068329, + "learning_rate": 9.902319902319902e-05, + "loss": 0.8106, + "step": 1919 + }, + { + "epoch": 2.109021007826445, + "grad_norm": 0.69195157289505, + "learning_rate": 9.890109890109889e-05, + "loss": 0.6643, + "step": 1920 + }, + { + "epoch": 2.110119456268021, + "grad_norm": 0.8207395672798157, + "learning_rate": 9.877899877899876e-05, + "loss": 0.7535, + "step": 1921 + }, + { + "epoch": 2.111217904709598, + "grad_norm": 1.4245035648345947, + "learning_rate": 9.865689865689865e-05, + "loss": 0.6721, + "step": 1922 + }, + { + "epoch": 2.112316353151174, + "grad_norm": 0.5496362447738647, + "learning_rate": 9.853479853479852e-05, + "loss": 0.5367, + "step": 1923 + }, + { + "epoch": 2.11341480159275, + "grad_norm": 0.5466665625572205, + "learning_rate": 9.84126984126984e-05, + "loss": 0.6083, + "step": 1924 + }, + { + "epoch": 2.1145132500343267, + "grad_norm": 0.7750464677810669, + "learning_rate": 9.829059829059829e-05, + "loss": 0.663, + "step": 1925 + }, + { + "epoch": 2.115611698475903, + "grad_norm": 0.4978208541870117, + "learning_rate": 9.816849816849817e-05, + "loss": 0.6334, + "step": 1926 + }, + { + "epoch": 2.116710146917479, + "grad_norm": 0.6415550708770752, + "learning_rate": 9.804639804639804e-05, + "loss": 0.6477, + "step": 1927 + }, + { + "epoch": 2.1178085953590555, + "grad_norm": 0.644123911857605, + "learning_rate": 9.792429792429792e-05, + "loss": 0.668, + "step": 1928 + }, + { + "epoch": 2.1189070438006317, + "grad_norm": 0.39706236124038696, + "learning_rate": 9.78021978021978e-05, + "loss": 0.5875, + "step": 1929 + }, + { + "epoch": 2.120005492242208, + "grad_norm": 1.3733233213424683, + "learning_rate": 9.768009768009767e-05, + "loss": 0.6023, + "step": 1930 + }, + { + "epoch": 2.121103940683784, + "grad_norm": 0.48839983344078064, + "learning_rate": 9.755799755799754e-05, + "loss": 0.5693, + "step": 1931 + }, + { + "epoch": 2.1222023891253605, + "grad_norm": 0.3107692301273346, + "learning_rate": 9.743589743589744e-05, + "loss": 0.5822, + "step": 1932 + }, + { + "epoch": 2.1233008375669367, + "grad_norm": 0.3988654911518097, + "learning_rate": 9.731379731379731e-05, + "loss": 0.5989, + "step": 1933 + }, + { + "epoch": 2.124399286008513, + "grad_norm": 1.1887754201889038, + "learning_rate": 9.719169719169718e-05, + "loss": 0.6382, + "step": 1934 + }, + { + "epoch": 2.1254977344500894, + "grad_norm": 0.43282651901245117, + "learning_rate": 9.706959706959707e-05, + "loss": 0.5649, + "step": 1935 + }, + { + "epoch": 2.1265961828916655, + "grad_norm": 0.39243975281715393, + "learning_rate": 9.694749694749694e-05, + "loss": 0.7005, + "step": 1936 + }, + { + "epoch": 2.1276946313332417, + "grad_norm": 0.7401454448699951, + "learning_rate": 9.682539682539681e-05, + "loss": 1.0632, + "step": 1937 + }, + { + "epoch": 2.1287930797748182, + "grad_norm": 0.6976983547210693, + "learning_rate": 9.67032967032967e-05, + "loss": 0.562, + "step": 1938 + }, + { + "epoch": 2.1298915282163944, + "grad_norm": 0.9784336686134338, + "learning_rate": 9.658119658119657e-05, + "loss": 0.8115, + "step": 1939 + }, + { + "epoch": 2.1309899766579705, + "grad_norm": 0.5289125442504883, + "learning_rate": 9.645909645909644e-05, + "loss": 0.6161, + "step": 1940 + }, + { + "epoch": 2.132088425099547, + "grad_norm": 1.414559006690979, + "learning_rate": 9.633699633699634e-05, + "loss": 0.7115, + "step": 1941 + }, + { + "epoch": 2.1331868735411232, + "grad_norm": 0.5444177389144897, + "learning_rate": 9.621489621489621e-05, + "loss": 0.6211, + "step": 1942 + }, + { + "epoch": 2.1342853219826994, + "grad_norm": 0.637030839920044, + "learning_rate": 9.609279609279608e-05, + "loss": 0.8747, + "step": 1943 + }, + { + "epoch": 2.1353837704242755, + "grad_norm": 0.5926198363304138, + "learning_rate": 9.597069597069595e-05, + "loss": 0.8673, + "step": 1944 + }, + { + "epoch": 2.136482218865852, + "grad_norm": 0.3638801872730255, + "learning_rate": 9.584859584859584e-05, + "loss": 0.4698, + "step": 1945 + }, + { + "epoch": 2.1375806673074282, + "grad_norm": 0.5823031067848206, + "learning_rate": 9.572649572649571e-05, + "loss": 0.6988, + "step": 1946 + }, + { + "epoch": 2.1386791157490044, + "grad_norm": 0.44348934292793274, + "learning_rate": 9.560439560439558e-05, + "loss": 0.6667, + "step": 1947 + }, + { + "epoch": 2.139777564190581, + "grad_norm": 3.177112579345703, + "learning_rate": 9.548229548229548e-05, + "loss": 0.8738, + "step": 1948 + }, + { + "epoch": 2.140876012632157, + "grad_norm": 1.3834997415542603, + "learning_rate": 9.536019536019536e-05, + "loss": 0.528, + "step": 1949 + }, + { + "epoch": 2.1419744610737332, + "grad_norm": 0.5514722466468811, + "learning_rate": 9.523809523809523e-05, + "loss": 0.5058, + "step": 1950 + }, + { + "epoch": 2.14307290951531, + "grad_norm": 0.8795000314712524, + "learning_rate": 9.511599511599511e-05, + "loss": 0.6368, + "step": 1951 + }, + { + "epoch": 2.144171357956886, + "grad_norm": 1.0043178796768188, + "learning_rate": 9.499389499389498e-05, + "loss": 0.5701, + "step": 1952 + }, + { + "epoch": 2.145269806398462, + "grad_norm": 1.8537780046463013, + "learning_rate": 9.487179487179486e-05, + "loss": 0.6978, + "step": 1953 + }, + { + "epoch": 2.1463682548400387, + "grad_norm": 0.5239475965499878, + "learning_rate": 9.474969474969476e-05, + "loss": 0.7093, + "step": 1954 + }, + { + "epoch": 2.147466703281615, + "grad_norm": 0.7944377064704895, + "learning_rate": 9.462759462759463e-05, + "loss": 0.7625, + "step": 1955 + }, + { + "epoch": 2.148565151723191, + "grad_norm": 0.7356003522872925, + "learning_rate": 9.45054945054945e-05, + "loss": 0.6845, + "step": 1956 + }, + { + "epoch": 2.149663600164767, + "grad_norm": 1.3590694665908813, + "learning_rate": 9.438339438339437e-05, + "loss": 0.6964, + "step": 1957 + }, + { + "epoch": 2.1507620486063437, + "grad_norm": 0.40889453887939453, + "learning_rate": 9.426129426129426e-05, + "loss": 0.6643, + "step": 1958 + }, + { + "epoch": 2.15186049704792, + "grad_norm": 0.6347643136978149, + "learning_rate": 9.413919413919413e-05, + "loss": 1.0002, + "step": 1959 + }, + { + "epoch": 2.152958945489496, + "grad_norm": 0.3661377429962158, + "learning_rate": 9.4017094017094e-05, + "loss": 0.5084, + "step": 1960 + }, + { + "epoch": 2.1540573939310725, + "grad_norm": 0.8262574672698975, + "learning_rate": 9.389499389499389e-05, + "loss": 0.5658, + "step": 1961 + }, + { + "epoch": 2.1551558423726487, + "grad_norm": 0.6054818034172058, + "learning_rate": 9.377289377289376e-05, + "loss": 0.6349, + "step": 1962 + }, + { + "epoch": 2.156254290814225, + "grad_norm": 0.3696078658103943, + "learning_rate": 9.365079365079364e-05, + "loss": 0.5746, + "step": 1963 + }, + { + "epoch": 2.157352739255801, + "grad_norm": 0.7613049745559692, + "learning_rate": 9.352869352869353e-05, + "loss": 0.5204, + "step": 1964 + }, + { + "epoch": 2.1584511876973775, + "grad_norm": 0.6841816306114197, + "learning_rate": 9.34065934065934e-05, + "loss": 0.813, + "step": 1965 + }, + { + "epoch": 2.1595496361389537, + "grad_norm": 0.902998685836792, + "learning_rate": 9.328449328449327e-05, + "loss": 0.6288, + "step": 1966 + }, + { + "epoch": 2.16064808458053, + "grad_norm": 0.5367470979690552, + "learning_rate": 9.316239316239316e-05, + "loss": 0.6689, + "step": 1967 + }, + { + "epoch": 2.1617465330221064, + "grad_norm": 0.9443572163581848, + "learning_rate": 9.304029304029303e-05, + "loss": 0.6864, + "step": 1968 + }, + { + "epoch": 2.1628449814636825, + "grad_norm": 0.42191457748413086, + "learning_rate": 9.29181929181929e-05, + "loss": 0.6509, + "step": 1969 + }, + { + "epoch": 2.1639434299052587, + "grad_norm": 0.6019404530525208, + "learning_rate": 9.279609279609277e-05, + "loss": 0.5252, + "step": 1970 + }, + { + "epoch": 2.1650418783468353, + "grad_norm": 1.9933907985687256, + "learning_rate": 9.267399267399267e-05, + "loss": 0.6042, + "step": 1971 + }, + { + "epoch": 2.1661403267884114, + "grad_norm": 0.33075836300849915, + "learning_rate": 9.255189255189255e-05, + "loss": 0.579, + "step": 1972 + }, + { + "epoch": 2.1672387752299875, + "grad_norm": 0.37899547815322876, + "learning_rate": 9.242979242979242e-05, + "loss": 0.5006, + "step": 1973 + }, + { + "epoch": 2.168337223671564, + "grad_norm": 0.6482734680175781, + "learning_rate": 9.23076923076923e-05, + "loss": 0.4844, + "step": 1974 + }, + { + "epoch": 2.1694356721131403, + "grad_norm": 0.47632062435150146, + "learning_rate": 9.218559218559217e-05, + "loss": 0.5844, + "step": 1975 + }, + { + "epoch": 2.1705341205547164, + "grad_norm": 0.3402813971042633, + "learning_rate": 9.206349206349205e-05, + "loss": 0.6397, + "step": 1976 + }, + { + "epoch": 2.1716325689962925, + "grad_norm": 0.47405871748924255, + "learning_rate": 9.194139194139195e-05, + "loss": 0.6436, + "step": 1977 + }, + { + "epoch": 2.172731017437869, + "grad_norm": 0.5474234223365784, + "learning_rate": 9.181929181929182e-05, + "loss": 0.5758, + "step": 1978 + }, + { + "epoch": 2.1738294658794453, + "grad_norm": 0.5423378348350525, + "learning_rate": 9.169719169719169e-05, + "loss": 0.5882, + "step": 1979 + }, + { + "epoch": 2.1749279143210214, + "grad_norm": 0.32848963141441345, + "learning_rate": 9.157509157509158e-05, + "loss": 0.5828, + "step": 1980 + }, + { + "epoch": 2.176026362762598, + "grad_norm": 0.6646802425384521, + "learning_rate": 9.145299145299145e-05, + "loss": 0.551, + "step": 1981 + }, + { + "epoch": 2.177124811204174, + "grad_norm": 0.4560980200767517, + "learning_rate": 9.133089133089132e-05, + "loss": 0.705, + "step": 1982 + }, + { + "epoch": 2.1782232596457503, + "grad_norm": 0.4531053304672241, + "learning_rate": 9.120879120879119e-05, + "loss": 0.7471, + "step": 1983 + }, + { + "epoch": 2.179321708087327, + "grad_norm": 0.5881507992744446, + "learning_rate": 9.108669108669108e-05, + "loss": 0.7559, + "step": 1984 + }, + { + "epoch": 2.180420156528903, + "grad_norm": 0.41462886333465576, + "learning_rate": 9.096459096459096e-05, + "loss": 0.5674, + "step": 1985 + }, + { + "epoch": 2.181518604970479, + "grad_norm": 0.46718108654022217, + "learning_rate": 9.084249084249083e-05, + "loss": 0.7149, + "step": 1986 + }, + { + "epoch": 2.1826170534120557, + "grad_norm": 0.49290111660957336, + "learning_rate": 9.072039072039072e-05, + "loss": 0.5641, + "step": 1987 + }, + { + "epoch": 2.183715501853632, + "grad_norm": 0.398296594619751, + "learning_rate": 9.059829059829059e-05, + "loss": 0.5177, + "step": 1988 + }, + { + "epoch": 2.184813950295208, + "grad_norm": 0.8241115212440491, + "learning_rate": 9.047619047619046e-05, + "loss": 0.7864, + "step": 1989 + }, + { + "epoch": 2.185912398736784, + "grad_norm": 1.1335865259170532, + "learning_rate": 9.035409035409035e-05, + "loss": 0.6167, + "step": 1990 + }, + { + "epoch": 2.1870108471783607, + "grad_norm": 0.4479789435863495, + "learning_rate": 9.023199023199022e-05, + "loss": 0.6365, + "step": 1991 + }, + { + "epoch": 2.188109295619937, + "grad_norm": 0.4892582297325134, + "learning_rate": 9.010989010989009e-05, + "loss": 0.6283, + "step": 1992 + }, + { + "epoch": 2.189207744061513, + "grad_norm": 0.8397974371910095, + "learning_rate": 8.998778998778999e-05, + "loss": 0.7123, + "step": 1993 + }, + { + "epoch": 2.1903061925030896, + "grad_norm": 0.5295377969741821, + "learning_rate": 8.986568986568986e-05, + "loss": 0.4033, + "step": 1994 + }, + { + "epoch": 2.1914046409446657, + "grad_norm": 0.464832067489624, + "learning_rate": 8.974358974358974e-05, + "loss": 0.8228, + "step": 1995 + }, + { + "epoch": 2.192503089386242, + "grad_norm": 0.381369024515152, + "learning_rate": 8.962148962148961e-05, + "loss": 0.6267, + "step": 1996 + }, + { + "epoch": 2.193601537827818, + "grad_norm": 0.7176710963249207, + "learning_rate": 8.949938949938949e-05, + "loss": 0.7008, + "step": 1997 + }, + { + "epoch": 2.1946999862693946, + "grad_norm": 2.569753885269165, + "learning_rate": 8.937728937728936e-05, + "loss": 0.6899, + "step": 1998 + }, + { + "epoch": 2.1957984347109707, + "grad_norm": 0.5020056962966919, + "learning_rate": 8.925518925518924e-05, + "loss": 0.527, + "step": 1999 + }, + { + "epoch": 2.196896883152547, + "grad_norm": 1.7054524421691895, + "learning_rate": 8.913308913308914e-05, + "loss": 0.5455, + "step": 2000 + }, + { + "epoch": 2.1979953315941234, + "grad_norm": 0.5037225484848022, + "learning_rate": 8.901098901098901e-05, + "loss": 0.7445, + "step": 2001 + }, + { + "epoch": 2.1990937800356996, + "grad_norm": 0.8109555840492249, + "learning_rate": 8.888888888888888e-05, + "loss": 0.624, + "step": 2002 + }, + { + "epoch": 2.2001922284772757, + "grad_norm": 0.47120043635368347, + "learning_rate": 8.876678876678877e-05, + "loss": 0.6858, + "step": 2003 + }, + { + "epoch": 2.2012906769188523, + "grad_norm": 0.6166191101074219, + "learning_rate": 8.864468864468864e-05, + "loss": 0.4528, + "step": 2004 + }, + { + "epoch": 2.2023891253604284, + "grad_norm": 0.4999128580093384, + "learning_rate": 8.852258852258851e-05, + "loss": 0.712, + "step": 2005 + }, + { + "epoch": 2.2034875738020046, + "grad_norm": 1.1858354806900024, + "learning_rate": 8.84004884004884e-05, + "loss": 0.7647, + "step": 2006 + }, + { + "epoch": 2.204586022243581, + "grad_norm": 0.4223528206348419, + "learning_rate": 8.827838827838828e-05, + "loss": 0.6553, + "step": 2007 + }, + { + "epoch": 2.2056844706851573, + "grad_norm": 0.41678956151008606, + "learning_rate": 8.815628815628815e-05, + "loss": 0.6033, + "step": 2008 + }, + { + "epoch": 2.2067829191267334, + "grad_norm": 0.5812666416168213, + "learning_rate": 8.803418803418802e-05, + "loss": 0.6016, + "step": 2009 + }, + { + "epoch": 2.2078813675683095, + "grad_norm": 0.5553560256958008, + "learning_rate": 8.791208791208791e-05, + "loss": 0.7621, + "step": 2010 + }, + { + "epoch": 2.208979816009886, + "grad_norm": 0.6392796635627747, + "learning_rate": 8.778998778998778e-05, + "loss": 0.567, + "step": 2011 + }, + { + "epoch": 2.2100782644514623, + "grad_norm": 1.0086902379989624, + "learning_rate": 8.766788766788765e-05, + "loss": 0.9432, + "step": 2012 + }, + { + "epoch": 2.2111767128930384, + "grad_norm": 1.3578602075576782, + "learning_rate": 8.754578754578754e-05, + "loss": 0.5107, + "step": 2013 + }, + { + "epoch": 2.212275161334615, + "grad_norm": 0.5530524849891663, + "learning_rate": 8.742368742368741e-05, + "loss": 0.6078, + "step": 2014 + }, + { + "epoch": 2.213373609776191, + "grad_norm": 0.3795104920864105, + "learning_rate": 8.730158730158728e-05, + "loss": 0.4889, + "step": 2015 + }, + { + "epoch": 2.2144720582177673, + "grad_norm": 0.40977227687835693, + "learning_rate": 8.717948717948718e-05, + "loss": 0.6295, + "step": 2016 + }, + { + "epoch": 2.215570506659344, + "grad_norm": 0.4882934093475342, + "learning_rate": 8.705738705738705e-05, + "loss": 0.7219, + "step": 2017 + }, + { + "epoch": 2.21666895510092, + "grad_norm": 0.7966530919075012, + "learning_rate": 8.693528693528693e-05, + "loss": 0.5342, + "step": 2018 + }, + { + "epoch": 2.217767403542496, + "grad_norm": 0.6992311477661133, + "learning_rate": 8.681318681318681e-05, + "loss": 0.5932, + "step": 2019 + }, + { + "epoch": 2.2188658519840727, + "grad_norm": 0.396427720785141, + "learning_rate": 8.669108669108668e-05, + "loss": 0.5838, + "step": 2020 + }, + { + "epoch": 2.219964300425649, + "grad_norm": 0.5625690817832947, + "learning_rate": 8.656898656898655e-05, + "loss": 0.7605, + "step": 2021 + }, + { + "epoch": 2.221062748867225, + "grad_norm": 0.6052583456039429, + "learning_rate": 8.644688644688643e-05, + "loss": 0.6572, + "step": 2022 + }, + { + "epoch": 2.222161197308801, + "grad_norm": 0.7201973795890808, + "learning_rate": 8.632478632478633e-05, + "loss": 0.4924, + "step": 2023 + }, + { + "epoch": 2.2232596457503777, + "grad_norm": 0.4222647249698639, + "learning_rate": 8.62026862026862e-05, + "loss": 0.7764, + "step": 2024 + }, + { + "epoch": 2.224358094191954, + "grad_norm": 0.5168121457099915, + "learning_rate": 8.608058608058607e-05, + "loss": 0.5766, + "step": 2025 + }, + { + "epoch": 2.22545654263353, + "grad_norm": 0.886203408241272, + "learning_rate": 8.595848595848596e-05, + "loss": 0.3804, + "step": 2026 + }, + { + "epoch": 2.2265549910751066, + "grad_norm": 1.7365875244140625, + "learning_rate": 8.583638583638583e-05, + "loss": 0.6583, + "step": 2027 + }, + { + "epoch": 2.2276534395166827, + "grad_norm": 0.44519639015197754, + "learning_rate": 8.57142857142857e-05, + "loss": 0.7322, + "step": 2028 + }, + { + "epoch": 2.228751887958259, + "grad_norm": 0.4888206422328949, + "learning_rate": 8.55921855921856e-05, + "loss": 0.6645, + "step": 2029 + }, + { + "epoch": 2.2298503363998354, + "grad_norm": 0.598225474357605, + "learning_rate": 8.547008547008547e-05, + "loss": 0.7903, + "step": 2030 + }, + { + "epoch": 2.2309487848414116, + "grad_norm": 0.8521910905838013, + "learning_rate": 8.534798534798534e-05, + "loss": 0.8573, + "step": 2031 + }, + { + "epoch": 2.2320472332829877, + "grad_norm": 1.6346311569213867, + "learning_rate": 8.522588522588523e-05, + "loss": 0.5653, + "step": 2032 + }, + { + "epoch": 2.233145681724564, + "grad_norm": 0.6574315428733826, + "learning_rate": 8.51037851037851e-05, + "loss": 0.5289, + "step": 2033 + }, + { + "epoch": 2.2342441301661404, + "grad_norm": 0.3821216821670532, + "learning_rate": 8.498168498168497e-05, + "loss": 0.4627, + "step": 2034 + }, + { + "epoch": 2.2353425786077166, + "grad_norm": 0.28965023159980774, + "learning_rate": 8.485958485958484e-05, + "loss": 0.3696, + "step": 2035 + }, + { + "epoch": 2.2364410270492927, + "grad_norm": 0.8256242275238037, + "learning_rate": 8.473748473748473e-05, + "loss": 0.6305, + "step": 2036 + }, + { + "epoch": 2.2375394754908693, + "grad_norm": 0.8374451398849487, + "learning_rate": 8.46153846153846e-05, + "loss": 0.5038, + "step": 2037 + }, + { + "epoch": 2.2386379239324454, + "grad_norm": 0.5931464433670044, + "learning_rate": 8.449328449328449e-05, + "loss": 0.6928, + "step": 2038 + }, + { + "epoch": 2.2397363723740216, + "grad_norm": 0.5120035409927368, + "learning_rate": 8.437118437118437e-05, + "loss": 0.6004, + "step": 2039 + }, + { + "epoch": 2.240834820815598, + "grad_norm": 0.6345282196998596, + "learning_rate": 8.424908424908424e-05, + "loss": 0.866, + "step": 2040 + }, + { + "epoch": 2.2419332692571743, + "grad_norm": 0.5632284283638, + "learning_rate": 8.412698412698412e-05, + "loss": 0.406, + "step": 2041 + }, + { + "epoch": 2.2430317176987504, + "grad_norm": 0.4784685969352722, + "learning_rate": 8.4004884004884e-05, + "loss": 0.4732, + "step": 2042 + }, + { + "epoch": 2.2441301661403266, + "grad_norm": 0.47678086161613464, + "learning_rate": 8.388278388278387e-05, + "loss": 0.502, + "step": 2043 + }, + { + "epoch": 2.245228614581903, + "grad_norm": 0.6543307304382324, + "learning_rate": 8.376068376068374e-05, + "loss": 0.7183, + "step": 2044 + }, + { + "epoch": 2.2463270630234793, + "grad_norm": 0.6147063374519348, + "learning_rate": 8.363858363858364e-05, + "loss": 0.618, + "step": 2045 + }, + { + "epoch": 2.2474255114650554, + "grad_norm": 0.5867168307304382, + "learning_rate": 8.351648351648352e-05, + "loss": 0.7749, + "step": 2046 + }, + { + "epoch": 2.248523959906632, + "grad_norm": 1.164838433265686, + "learning_rate": 8.339438339438339e-05, + "loss": 0.6261, + "step": 2047 + }, + { + "epoch": 2.249622408348208, + "grad_norm": 0.6695102453231812, + "learning_rate": 8.327228327228326e-05, + "loss": 0.6172, + "step": 2048 + }, + { + "epoch": 2.2507208567897843, + "grad_norm": 0.43873751163482666, + "learning_rate": 8.315018315018315e-05, + "loss": 0.7032, + "step": 2049 + }, + { + "epoch": 2.251819305231361, + "grad_norm": 0.439897745847702, + "learning_rate": 8.302808302808302e-05, + "loss": 0.7744, + "step": 2050 + }, + { + "epoch": 2.252917753672937, + "grad_norm": 0.6671053767204285, + "learning_rate": 8.290598290598289e-05, + "loss": 0.6877, + "step": 2051 + }, + { + "epoch": 2.254016202114513, + "grad_norm": 0.37354105710983276, + "learning_rate": 8.278388278388279e-05, + "loss": 0.5653, + "step": 2052 + }, + { + "epoch": 2.2551146505560897, + "grad_norm": 0.5615684390068054, + "learning_rate": 8.266178266178266e-05, + "loss": 0.5961, + "step": 2053 + }, + { + "epoch": 2.256213098997666, + "grad_norm": 2.0932323932647705, + "learning_rate": 8.253968253968253e-05, + "loss": 0.6139, + "step": 2054 + }, + { + "epoch": 2.257311547439242, + "grad_norm": 0.5486952066421509, + "learning_rate": 8.241758241758242e-05, + "loss": 0.7816, + "step": 2055 + }, + { + "epoch": 2.258409995880818, + "grad_norm": 0.7377699017524719, + "learning_rate": 8.229548229548229e-05, + "loss": 0.5036, + "step": 2056 + }, + { + "epoch": 2.2595084443223947, + "grad_norm": 0.7057545781135559, + "learning_rate": 8.217338217338216e-05, + "loss": 0.5788, + "step": 2057 + }, + { + "epoch": 2.260606892763971, + "grad_norm": 0.5388674736022949, + "learning_rate": 8.205128205128205e-05, + "loss": 0.7079, + "step": 2058 + }, + { + "epoch": 2.261705341205547, + "grad_norm": 0.620943546295166, + "learning_rate": 8.192918192918192e-05, + "loss": 0.6223, + "step": 2059 + }, + { + "epoch": 2.2628037896471236, + "grad_norm": 0.6159489154815674, + "learning_rate": 8.18070818070818e-05, + "loss": 0.7277, + "step": 2060 + }, + { + "epoch": 2.2639022380886997, + "grad_norm": 0.5745131373405457, + "learning_rate": 8.168498168498168e-05, + "loss": 0.6356, + "step": 2061 + }, + { + "epoch": 2.265000686530276, + "grad_norm": 0.4925720989704132, + "learning_rate": 8.156288156288156e-05, + "loss": 0.6342, + "step": 2062 + }, + { + "epoch": 2.2660991349718524, + "grad_norm": 0.410692036151886, + "learning_rate": 8.144078144078143e-05, + "loss": 0.5903, + "step": 2063 + }, + { + "epoch": 2.2671975834134286, + "grad_norm": 0.8246005177497864, + "learning_rate": 8.13186813186813e-05, + "loss": 0.4048, + "step": 2064 + }, + { + "epoch": 2.2682960318550047, + "grad_norm": 0.5054492950439453, + "learning_rate": 8.119658119658119e-05, + "loss": 0.5797, + "step": 2065 + }, + { + "epoch": 2.2693944802965813, + "grad_norm": 0.6249692440032959, + "learning_rate": 8.107448107448106e-05, + "loss": 0.5434, + "step": 2066 + }, + { + "epoch": 2.2704929287381574, + "grad_norm": 0.5582659244537354, + "learning_rate": 8.095238095238093e-05, + "loss": 0.5925, + "step": 2067 + }, + { + "epoch": 2.2715913771797336, + "grad_norm": 0.38472238183021545, + "learning_rate": 8.083028083028083e-05, + "loss": 0.7325, + "step": 2068 + }, + { + "epoch": 2.2726898256213097, + "grad_norm": 0.4649077355861664, + "learning_rate": 8.07081807081807e-05, + "loss": 0.6244, + "step": 2069 + }, + { + "epoch": 2.2737882740628863, + "grad_norm": 0.38582849502563477, + "learning_rate": 8.058608058608058e-05, + "loss": 0.7696, + "step": 2070 + }, + { + "epoch": 2.2748867225044624, + "grad_norm": 0.4612105190753937, + "learning_rate": 8.046398046398045e-05, + "loss": 0.6453, + "step": 2071 + }, + { + "epoch": 2.2759851709460386, + "grad_norm": 0.6572852730751038, + "learning_rate": 8.034188034188034e-05, + "loss": 0.7417, + "step": 2072 + }, + { + "epoch": 2.277083619387615, + "grad_norm": 0.6322109699249268, + "learning_rate": 8.021978021978021e-05, + "loss": 0.2827, + "step": 2073 + }, + { + "epoch": 2.2781820678291913, + "grad_norm": 1.2452771663665771, + "learning_rate": 8.009768009768008e-05, + "loss": 0.7441, + "step": 2074 + }, + { + "epoch": 2.2792805162707674, + "grad_norm": 0.32154834270477295, + "learning_rate": 7.997557997557998e-05, + "loss": 0.4606, + "step": 2075 + }, + { + "epoch": 2.2803789647123436, + "grad_norm": 1.0170034170150757, + "learning_rate": 7.985347985347985e-05, + "loss": 0.7003, + "step": 2076 + }, + { + "epoch": 2.28147741315392, + "grad_norm": 0.7780435085296631, + "learning_rate": 7.973137973137972e-05, + "loss": 0.5847, + "step": 2077 + }, + { + "epoch": 2.2825758615954963, + "grad_norm": 0.6422854661941528, + "learning_rate": 7.960927960927961e-05, + "loss": 0.6278, + "step": 2078 + }, + { + "epoch": 2.2836743100370724, + "grad_norm": 0.5440393090248108, + "learning_rate": 7.948717948717948e-05, + "loss": 0.6313, + "step": 2079 + }, + { + "epoch": 2.284772758478649, + "grad_norm": 0.5774940848350525, + "learning_rate": 7.936507936507935e-05, + "loss": 0.7504, + "step": 2080 + }, + { + "epoch": 2.285871206920225, + "grad_norm": 0.44180789589881897, + "learning_rate": 7.924297924297924e-05, + "loss": 0.5806, + "step": 2081 + }, + { + "epoch": 2.2869696553618013, + "grad_norm": 0.8452728390693665, + "learning_rate": 7.912087912087912e-05, + "loss": 0.5753, + "step": 2082 + }, + { + "epoch": 2.288068103803378, + "grad_norm": 0.40172943472862244, + "learning_rate": 7.8998778998779e-05, + "loss": 0.5565, + "step": 2083 + }, + { + "epoch": 2.289166552244954, + "grad_norm": 0.3919180929660797, + "learning_rate": 7.887667887667887e-05, + "loss": 0.4951, + "step": 2084 + }, + { + "epoch": 2.29026500068653, + "grad_norm": 1.0796260833740234, + "learning_rate": 7.875457875457875e-05, + "loss": 0.733, + "step": 2085 + }, + { + "epoch": 2.2913634491281067, + "grad_norm": 0.5640047788619995, + "learning_rate": 7.863247863247862e-05, + "loss": 0.4625, + "step": 2086 + }, + { + "epoch": 2.292461897569683, + "grad_norm": 0.8736083507537842, + "learning_rate": 7.85103785103785e-05, + "loss": 0.5532, + "step": 2087 + }, + { + "epoch": 2.293560346011259, + "grad_norm": 0.5358221530914307, + "learning_rate": 7.838827838827838e-05, + "loss": 0.6397, + "step": 2088 + }, + { + "epoch": 2.294658794452835, + "grad_norm": 5.207391262054443, + "learning_rate": 7.826617826617825e-05, + "loss": 0.6402, + "step": 2089 + }, + { + "epoch": 2.2957572428944117, + "grad_norm": 0.4122523069381714, + "learning_rate": 7.814407814407813e-05, + "loss": 0.474, + "step": 2090 + }, + { + "epoch": 2.296855691335988, + "grad_norm": 2.8296186923980713, + "learning_rate": 7.802197802197802e-05, + "loss": 0.5197, + "step": 2091 + }, + { + "epoch": 2.297954139777564, + "grad_norm": 0.6898410320281982, + "learning_rate": 7.78998778998779e-05, + "loss": 0.782, + "step": 2092 + }, + { + "epoch": 2.2990525882191406, + "grad_norm": 0.37363025546073914, + "learning_rate": 7.777777777777777e-05, + "loss": 0.5824, + "step": 2093 + }, + { + "epoch": 2.3001510366607167, + "grad_norm": 0.5120764374732971, + "learning_rate": 7.765567765567765e-05, + "loss": 0.7326, + "step": 2094 + }, + { + "epoch": 2.301249485102293, + "grad_norm": 0.6517985463142395, + "learning_rate": 7.753357753357753e-05, + "loss": 0.6274, + "step": 2095 + }, + { + "epoch": 2.3023479335438695, + "grad_norm": 0.8033846020698547, + "learning_rate": 7.74114774114774e-05, + "loss": 0.7093, + "step": 2096 + }, + { + "epoch": 2.3034463819854456, + "grad_norm": 0.896397590637207, + "learning_rate": 7.728937728937727e-05, + "loss": 0.6685, + "step": 2097 + }, + { + "epoch": 2.3045448304270217, + "grad_norm": 0.4606597423553467, + "learning_rate": 7.716727716727717e-05, + "loss": 0.5821, + "step": 2098 + }, + { + "epoch": 2.3056432788685983, + "grad_norm": 0.9286845922470093, + "learning_rate": 7.704517704517704e-05, + "loss": 0.7537, + "step": 2099 + }, + { + "epoch": 2.3067417273101745, + "grad_norm": 0.6514043211936951, + "learning_rate": 7.692307692307691e-05, + "loss": 0.5644, + "step": 2100 + }, + { + "epoch": 2.3078401757517506, + "grad_norm": 0.4881083369255066, + "learning_rate": 7.68009768009768e-05, + "loss": 0.5348, + "step": 2101 + }, + { + "epoch": 2.3089386241933267, + "grad_norm": 2.688716173171997, + "learning_rate": 7.667887667887667e-05, + "loss": 0.6732, + "step": 2102 + }, + { + "epoch": 2.3100370726349033, + "grad_norm": 0.4597708582878113, + "learning_rate": 7.655677655677654e-05, + "loss": 0.6166, + "step": 2103 + }, + { + "epoch": 2.3111355210764795, + "grad_norm": 0.7629315853118896, + "learning_rate": 7.643467643467644e-05, + "loss": 0.4677, + "step": 2104 + }, + { + "epoch": 2.3122339695180556, + "grad_norm": 0.7282788753509521, + "learning_rate": 7.631257631257631e-05, + "loss": 0.6841, + "step": 2105 + }, + { + "epoch": 2.313332417959632, + "grad_norm": 0.5421862006187439, + "learning_rate": 7.619047619047618e-05, + "loss": 0.7274, + "step": 2106 + }, + { + "epoch": 2.3144308664012083, + "grad_norm": 0.7396867871284485, + "learning_rate": 7.606837606837607e-05, + "loss": 0.6546, + "step": 2107 + }, + { + "epoch": 2.3155293148427845, + "grad_norm": 0.34731313586235046, + "learning_rate": 7.594627594627594e-05, + "loss": 0.72, + "step": 2108 + }, + { + "epoch": 2.3166277632843606, + "grad_norm": 1.1024978160858154, + "learning_rate": 7.582417582417581e-05, + "loss": 0.7304, + "step": 2109 + }, + { + "epoch": 2.317726211725937, + "grad_norm": 0.5866183638572693, + "learning_rate": 7.570207570207569e-05, + "loss": 0.4912, + "step": 2110 + }, + { + "epoch": 2.3188246601675133, + "grad_norm": 0.8068836331367493, + "learning_rate": 7.557997557997557e-05, + "loss": 0.5342, + "step": 2111 + }, + { + "epoch": 2.31992310860909, + "grad_norm": 0.6417646408081055, + "learning_rate": 7.545787545787544e-05, + "loss": 0.7642, + "step": 2112 + }, + { + "epoch": 2.321021557050666, + "grad_norm": 0.4545808434486389, + "learning_rate": 7.533577533577533e-05, + "loss": 0.5681, + "step": 2113 + }, + { + "epoch": 2.322120005492242, + "grad_norm": 0.3567211329936981, + "learning_rate": 7.521367521367521e-05, + "loss": 0.6368, + "step": 2114 + }, + { + "epoch": 2.3232184539338183, + "grad_norm": 0.5747010707855225, + "learning_rate": 7.509157509157509e-05, + "loss": 0.5848, + "step": 2115 + }, + { + "epoch": 2.324316902375395, + "grad_norm": 0.46303555369377136, + "learning_rate": 7.496947496947497e-05, + "loss": 0.6577, + "step": 2116 + }, + { + "epoch": 2.325415350816971, + "grad_norm": 0.5343080759048462, + "learning_rate": 7.484737484737484e-05, + "loss": 0.8531, + "step": 2117 + }, + { + "epoch": 2.326513799258547, + "grad_norm": 0.9027140736579895, + "learning_rate": 7.472527472527472e-05, + "loss": 0.6271, + "step": 2118 + }, + { + "epoch": 2.3276122477001238, + "grad_norm": 0.6390063166618347, + "learning_rate": 7.460317460317459e-05, + "loss": 0.5669, + "step": 2119 + }, + { + "epoch": 2.3287106961417, + "grad_norm": 0.4965013563632965, + "learning_rate": 7.448107448107447e-05, + "loss": 0.6362, + "step": 2120 + }, + { + "epoch": 2.329809144583276, + "grad_norm": 0.49252766370773315, + "learning_rate": 7.435897435897436e-05, + "loss": 0.6703, + "step": 2121 + }, + { + "epoch": 2.330907593024852, + "grad_norm": 0.7043023705482483, + "learning_rate": 7.423687423687423e-05, + "loss": 0.7114, + "step": 2122 + }, + { + "epoch": 2.3320060414664288, + "grad_norm": 0.4373185634613037, + "learning_rate": 7.41147741147741e-05, + "loss": 0.5656, + "step": 2123 + }, + { + "epoch": 2.333104489908005, + "grad_norm": 1.0036537647247314, + "learning_rate": 7.399267399267399e-05, + "loss": 0.6652, + "step": 2124 + }, + { + "epoch": 2.334202938349581, + "grad_norm": 2.06589937210083, + "learning_rate": 7.387057387057386e-05, + "loss": 0.6502, + "step": 2125 + }, + { + "epoch": 2.3353013867911576, + "grad_norm": 1.1616554260253906, + "learning_rate": 7.374847374847375e-05, + "loss": 0.7288, + "step": 2126 + }, + { + "epoch": 2.3363998352327338, + "grad_norm": 0.4532950520515442, + "learning_rate": 7.362637362637362e-05, + "loss": 0.7696, + "step": 2127 + }, + { + "epoch": 2.33749828367431, + "grad_norm": 1.0143449306488037, + "learning_rate": 7.35042735042735e-05, + "loss": 1.0185, + "step": 2128 + }, + { + "epoch": 2.3385967321158865, + "grad_norm": 2.2059850692749023, + "learning_rate": 7.338217338217337e-05, + "loss": 0.6267, + "step": 2129 + }, + { + "epoch": 2.3396951805574626, + "grad_norm": 0.4883456826210022, + "learning_rate": 7.326007326007325e-05, + "loss": 0.6081, + "step": 2130 + }, + { + "epoch": 2.3407936289990388, + "grad_norm": 0.42373138666152954, + "learning_rate": 7.313797313797313e-05, + "loss": 0.6204, + "step": 2131 + }, + { + "epoch": 2.3418920774406153, + "grad_norm": 0.43958979845046997, + "learning_rate": 7.3015873015873e-05, + "loss": 0.7608, + "step": 2132 + }, + { + "epoch": 2.3429905258821915, + "grad_norm": 0.4493010342121124, + "learning_rate": 7.289377289377289e-05, + "loss": 0.5985, + "step": 2133 + }, + { + "epoch": 2.3440889743237676, + "grad_norm": 0.38533085584640503, + "learning_rate": 7.277167277167276e-05, + "loss": 0.445, + "step": 2134 + }, + { + "epoch": 2.3451874227653438, + "grad_norm": 0.37900710105895996, + "learning_rate": 7.264957264957265e-05, + "loss": 0.8466, + "step": 2135 + }, + { + "epoch": 2.3462858712069203, + "grad_norm": 1.7598285675048828, + "learning_rate": 7.252747252747252e-05, + "loss": 0.6881, + "step": 2136 + }, + { + "epoch": 2.3473843196484965, + "grad_norm": 0.5551338791847229, + "learning_rate": 7.24053724053724e-05, + "loss": 0.5908, + "step": 2137 + }, + { + "epoch": 2.3484827680900726, + "grad_norm": 0.42995861172676086, + "learning_rate": 7.228327228327228e-05, + "loss": 0.689, + "step": 2138 + }, + { + "epoch": 2.349581216531649, + "grad_norm": 0.6428760290145874, + "learning_rate": 7.216117216117216e-05, + "loss": 0.5879, + "step": 2139 + }, + { + "epoch": 2.3506796649732253, + "grad_norm": 0.6199445724487305, + "learning_rate": 7.203907203907203e-05, + "loss": 0.5275, + "step": 2140 + }, + { + "epoch": 2.3517781134148015, + "grad_norm": 0.4687311053276062, + "learning_rate": 7.19169719169719e-05, + "loss": 0.7046, + "step": 2141 + }, + { + "epoch": 2.352876561856378, + "grad_norm": 0.47645121812820435, + "learning_rate": 7.179487179487179e-05, + "loss": 0.4787, + "step": 2142 + }, + { + "epoch": 2.353975010297954, + "grad_norm": 1.3774843215942383, + "learning_rate": 7.167277167277166e-05, + "loss": 0.565, + "step": 2143 + }, + { + "epoch": 2.3550734587395303, + "grad_norm": 0.9585548043251038, + "learning_rate": 7.155067155067155e-05, + "loss": 0.7496, + "step": 2144 + }, + { + "epoch": 2.356171907181107, + "grad_norm": 0.9073938131332397, + "learning_rate": 7.142857142857142e-05, + "loss": 0.6785, + "step": 2145 + }, + { + "epoch": 2.357270355622683, + "grad_norm": 1.4543087482452393, + "learning_rate": 7.13064713064713e-05, + "loss": 0.4827, + "step": 2146 + }, + { + "epoch": 2.358368804064259, + "grad_norm": 0.49685895442962646, + "learning_rate": 7.118437118437118e-05, + "loss": 0.5624, + "step": 2147 + }, + { + "epoch": 2.3594672525058353, + "grad_norm": 0.3820716142654419, + "learning_rate": 7.106227106227105e-05, + "loss": 0.5326, + "step": 2148 + }, + { + "epoch": 2.360565700947412, + "grad_norm": 0.6018278002738953, + "learning_rate": 7.094017094017094e-05, + "loss": 0.7372, + "step": 2149 + }, + { + "epoch": 2.361664149388988, + "grad_norm": 0.49245381355285645, + "learning_rate": 7.081807081807082e-05, + "loss": 0.714, + "step": 2150 + }, + { + "epoch": 2.362762597830564, + "grad_norm": 0.5913417339324951, + "learning_rate": 7.069597069597069e-05, + "loss": 0.6395, + "step": 2151 + }, + { + "epoch": 2.3638610462721408, + "grad_norm": 0.3142958879470825, + "learning_rate": 7.057387057387056e-05, + "loss": 0.4363, + "step": 2152 + }, + { + "epoch": 2.364959494713717, + "grad_norm": 0.44251006841659546, + "learning_rate": 7.045177045177044e-05, + "loss": 0.5751, + "step": 2153 + }, + { + "epoch": 2.366057943155293, + "grad_norm": 0.7642143964767456, + "learning_rate": 7.032967032967032e-05, + "loss": 0.9707, + "step": 2154 + }, + { + "epoch": 2.367156391596869, + "grad_norm": 0.3676380217075348, + "learning_rate": 7.020757020757021e-05, + "loss": 0.6142, + "step": 2155 + }, + { + "epoch": 2.3682548400384458, + "grad_norm": 0.43112027645111084, + "learning_rate": 7.008547008547008e-05, + "loss": 0.6194, + "step": 2156 + }, + { + "epoch": 2.369353288480022, + "grad_norm": 0.5463792681694031, + "learning_rate": 6.996336996336996e-05, + "loss": 0.5478, + "step": 2157 + }, + { + "epoch": 2.370451736921598, + "grad_norm": 0.5498053431510925, + "learning_rate": 6.984126984126984e-05, + "loss": 0.8373, + "step": 2158 + }, + { + "epoch": 2.3715501853631746, + "grad_norm": 0.5144299268722534, + "learning_rate": 6.971916971916971e-05, + "loss": 0.7033, + "step": 2159 + }, + { + "epoch": 2.3726486338047508, + "grad_norm": 0.4049033522605896, + "learning_rate": 6.95970695970696e-05, + "loss": 0.6257, + "step": 2160 + }, + { + "epoch": 2.373747082246327, + "grad_norm": 0.8007866740226746, + "learning_rate": 6.947496947496947e-05, + "loss": 1.1859, + "step": 2161 + }, + { + "epoch": 2.3748455306879035, + "grad_norm": 0.6302816867828369, + "learning_rate": 6.935286935286935e-05, + "loss": 0.4972, + "step": 2162 + }, + { + "epoch": 2.3759439791294796, + "grad_norm": 0.4181542694568634, + "learning_rate": 6.923076923076922e-05, + "loss": 0.5543, + "step": 2163 + }, + { + "epoch": 2.3770424275710558, + "grad_norm": 0.45409703254699707, + "learning_rate": 6.91086691086691e-05, + "loss": 0.6237, + "step": 2164 + }, + { + "epoch": 2.3781408760126324, + "grad_norm": 0.5172666907310486, + "learning_rate": 6.898656898656898e-05, + "loss": 0.5798, + "step": 2165 + }, + { + "epoch": 2.3792393244542085, + "grad_norm": 0.7849127054214478, + "learning_rate": 6.886446886446885e-05, + "loss": 0.8282, + "step": 2166 + }, + { + "epoch": 2.3803377728957846, + "grad_norm": 0.4041041135787964, + "learning_rate": 6.874236874236874e-05, + "loss": 0.5046, + "step": 2167 + }, + { + "epoch": 2.3814362213373608, + "grad_norm": 0.35880064964294434, + "learning_rate": 6.862026862026862e-05, + "loss": 0.4096, + "step": 2168 + }, + { + "epoch": 2.3825346697789374, + "grad_norm": 0.5949457883834839, + "learning_rate": 6.84981684981685e-05, + "loss": 0.6666, + "step": 2169 + }, + { + "epoch": 2.3836331182205135, + "grad_norm": 0.6332186460494995, + "learning_rate": 6.837606837606837e-05, + "loss": 0.9715, + "step": 2170 + }, + { + "epoch": 2.3847315666620896, + "grad_norm": 0.3173432946205139, + "learning_rate": 6.825396825396824e-05, + "loss": 0.6792, + "step": 2171 + }, + { + "epoch": 2.385830015103666, + "grad_norm": 0.7556782364845276, + "learning_rate": 6.813186813186813e-05, + "loss": 0.7267, + "step": 2172 + }, + { + "epoch": 2.3869284635452424, + "grad_norm": 0.43191683292388916, + "learning_rate": 6.800976800976801e-05, + "loss": 0.5841, + "step": 2173 + }, + { + "epoch": 2.3880269119868185, + "grad_norm": 0.4010660946369171, + "learning_rate": 6.788766788766788e-05, + "loss": 0.7491, + "step": 2174 + }, + { + "epoch": 2.389125360428395, + "grad_norm": 0.6889204382896423, + "learning_rate": 6.776556776556775e-05, + "loss": 0.4539, + "step": 2175 + }, + { + "epoch": 2.390223808869971, + "grad_norm": 0.4509136974811554, + "learning_rate": 6.764346764346764e-05, + "loss": 0.7066, + "step": 2176 + }, + { + "epoch": 2.3913222573115474, + "grad_norm": 0.4313298463821411, + "learning_rate": 6.752136752136751e-05, + "loss": 0.6292, + "step": 2177 + }, + { + "epoch": 2.392420705753124, + "grad_norm": 0.7713265419006348, + "learning_rate": 6.73992673992674e-05, + "loss": 0.8392, + "step": 2178 + }, + { + "epoch": 2.3935191541947, + "grad_norm": 0.5283428430557251, + "learning_rate": 6.727716727716727e-05, + "loss": 0.6912, + "step": 2179 + }, + { + "epoch": 2.394617602636276, + "grad_norm": 0.40429314970970154, + "learning_rate": 6.715506715506716e-05, + "loss": 0.4335, + "step": 2180 + }, + { + "epoch": 2.3957160510778523, + "grad_norm": 0.6888754367828369, + "learning_rate": 6.703296703296703e-05, + "loss": 0.6276, + "step": 2181 + }, + { + "epoch": 2.396814499519429, + "grad_norm": 0.5595026612281799, + "learning_rate": 6.69108669108669e-05, + "loss": 0.7806, + "step": 2182 + }, + { + "epoch": 2.397912947961005, + "grad_norm": 0.32394587993621826, + "learning_rate": 6.678876678876678e-05, + "loss": 0.5531, + "step": 2183 + }, + { + "epoch": 2.399011396402581, + "grad_norm": 0.5909039974212646, + "learning_rate": 6.666666666666666e-05, + "loss": 0.4932, + "step": 2184 + }, + { + "epoch": 2.400109844844158, + "grad_norm": 0.4148501455783844, + "learning_rate": 6.654456654456654e-05, + "loss": 0.5637, + "step": 2185 + }, + { + "epoch": 2.401208293285734, + "grad_norm": 0.558403491973877, + "learning_rate": 6.642246642246641e-05, + "loss": 0.5733, + "step": 2186 + }, + { + "epoch": 2.40230674172731, + "grad_norm": 0.5171149373054504, + "learning_rate": 6.630036630036629e-05, + "loss": 0.6931, + "step": 2187 + }, + { + "epoch": 2.403405190168886, + "grad_norm": 0.44966164231300354, + "learning_rate": 6.617826617826617e-05, + "loss": 0.5061, + "step": 2188 + }, + { + "epoch": 2.404503638610463, + "grad_norm": 0.45499417185783386, + "learning_rate": 6.605616605616606e-05, + "loss": 0.3726, + "step": 2189 + }, + { + "epoch": 2.405602087052039, + "grad_norm": 0.5790139436721802, + "learning_rate": 6.593406593406593e-05, + "loss": 0.6647, + "step": 2190 + }, + { + "epoch": 2.4067005354936155, + "grad_norm": 0.5948793292045593, + "learning_rate": 6.581196581196581e-05, + "loss": 0.765, + "step": 2191 + }, + { + "epoch": 2.4077989839351917, + "grad_norm": 0.5925643444061279, + "learning_rate": 6.568986568986569e-05, + "loss": 0.889, + "step": 2192 + }, + { + "epoch": 2.408897432376768, + "grad_norm": 0.5776219964027405, + "learning_rate": 6.556776556776556e-05, + "loss": 0.5506, + "step": 2193 + }, + { + "epoch": 2.409995880818344, + "grad_norm": 0.44397997856140137, + "learning_rate": 6.544566544566544e-05, + "loss": 0.5372, + "step": 2194 + }, + { + "epoch": 2.4110943292599205, + "grad_norm": 0.45733606815338135, + "learning_rate": 6.532356532356532e-05, + "loss": 0.7207, + "step": 2195 + }, + { + "epoch": 2.4121927777014966, + "grad_norm": 0.38223645091056824, + "learning_rate": 6.52014652014652e-05, + "loss": 0.5888, + "step": 2196 + }, + { + "epoch": 2.413291226143073, + "grad_norm": 0.3642580211162567, + "learning_rate": 6.507936507936507e-05, + "loss": 0.5687, + "step": 2197 + }, + { + "epoch": 2.4143896745846494, + "grad_norm": 0.42435723543167114, + "learning_rate": 6.495726495726494e-05, + "loss": 0.6056, + "step": 2198 + }, + { + "epoch": 2.4154881230262255, + "grad_norm": 0.4998740255832672, + "learning_rate": 6.483516483516483e-05, + "loss": 0.6813, + "step": 2199 + }, + { + "epoch": 2.4165865714678016, + "grad_norm": 0.47158849239349365, + "learning_rate": 6.47130647130647e-05, + "loss": 0.5585, + "step": 2200 + }, + { + "epoch": 2.417685019909378, + "grad_norm": 0.4780612289905548, + "learning_rate": 6.459096459096459e-05, + "loss": 0.4941, + "step": 2201 + }, + { + "epoch": 2.4187834683509544, + "grad_norm": 0.5073630809783936, + "learning_rate": 6.446886446886447e-05, + "loss": 0.4549, + "step": 2202 + }, + { + "epoch": 2.4198819167925305, + "grad_norm": 0.4311310052871704, + "learning_rate": 6.434676434676435e-05, + "loss": 0.4419, + "step": 2203 + }, + { + "epoch": 2.4209803652341066, + "grad_norm": 0.3557896316051483, + "learning_rate": 6.422466422466422e-05, + "loss": 0.6973, + "step": 2204 + }, + { + "epoch": 2.4220788136756832, + "grad_norm": 0.6171516180038452, + "learning_rate": 6.410256410256409e-05, + "loss": 0.7554, + "step": 2205 + }, + { + "epoch": 2.4231772621172594, + "grad_norm": 0.4687957465648651, + "learning_rate": 6.398046398046397e-05, + "loss": 0.7429, + "step": 2206 + }, + { + "epoch": 2.4242757105588355, + "grad_norm": 0.8685696125030518, + "learning_rate": 6.385836385836386e-05, + "loss": 0.5896, + "step": 2207 + }, + { + "epoch": 2.425374159000412, + "grad_norm": 0.39599040150642395, + "learning_rate": 6.373626373626373e-05, + "loss": 0.4744, + "step": 2208 + }, + { + "epoch": 2.4264726074419882, + "grad_norm": 0.9079630970954895, + "learning_rate": 6.36141636141636e-05, + "loss": 0.6067, + "step": 2209 + }, + { + "epoch": 2.4275710558835644, + "grad_norm": 0.5051462054252625, + "learning_rate": 6.349206349206349e-05, + "loss": 0.7314, + "step": 2210 + }, + { + "epoch": 2.428669504325141, + "grad_norm": 0.4899844825267792, + "learning_rate": 6.336996336996336e-05, + "loss": 0.7086, + "step": 2211 + }, + { + "epoch": 2.429767952766717, + "grad_norm": 0.5135432481765747, + "learning_rate": 6.324786324786325e-05, + "loss": 0.5261, + "step": 2212 + }, + { + "epoch": 2.4308664012082932, + "grad_norm": 0.6025048494338989, + "learning_rate": 6.312576312576312e-05, + "loss": 0.5276, + "step": 2213 + }, + { + "epoch": 2.4319648496498694, + "grad_norm": 0.6931442022323608, + "learning_rate": 6.3003663003663e-05, + "loss": 0.6535, + "step": 2214 + }, + { + "epoch": 2.433063298091446, + "grad_norm": 0.695106565952301, + "learning_rate": 6.288156288156288e-05, + "loss": 0.9183, + "step": 2215 + }, + { + "epoch": 2.434161746533022, + "grad_norm": 0.450100302696228, + "learning_rate": 6.275946275946275e-05, + "loss": 0.5049, + "step": 2216 + }, + { + "epoch": 2.4352601949745982, + "grad_norm": 0.5539785623550415, + "learning_rate": 6.263736263736263e-05, + "loss": 0.5735, + "step": 2217 + }, + { + "epoch": 2.436358643416175, + "grad_norm": 0.5560977458953857, + "learning_rate": 6.25152625152625e-05, + "loss": 0.7364, + "step": 2218 + }, + { + "epoch": 2.437457091857751, + "grad_norm": 0.740195095539093, + "learning_rate": 6.239316239316239e-05, + "loss": 0.7839, + "step": 2219 + }, + { + "epoch": 2.438555540299327, + "grad_norm": 0.9324271082878113, + "learning_rate": 6.227106227106226e-05, + "loss": 0.6365, + "step": 2220 + }, + { + "epoch": 2.4396539887409037, + "grad_norm": 0.5540104508399963, + "learning_rate": 6.214896214896215e-05, + "loss": 0.6586, + "step": 2221 + }, + { + "epoch": 2.44075243718248, + "grad_norm": 0.5028054714202881, + "learning_rate": 6.202686202686202e-05, + "loss": 0.4422, + "step": 2222 + }, + { + "epoch": 2.441850885624056, + "grad_norm": 0.7052125930786133, + "learning_rate": 6.190476190476189e-05, + "loss": 0.7248, + "step": 2223 + }, + { + "epoch": 2.4429493340656325, + "grad_norm": 0.6705207824707031, + "learning_rate": 6.178266178266178e-05, + "loss": 0.81, + "step": 2224 + }, + { + "epoch": 2.4440477825072087, + "grad_norm": 0.7996514439582825, + "learning_rate": 6.166056166056166e-05, + "loss": 0.382, + "step": 2225 + }, + { + "epoch": 2.445146230948785, + "grad_norm": 1.5169689655303955, + "learning_rate": 6.153846153846154e-05, + "loss": 0.7373, + "step": 2226 + }, + { + "epoch": 2.446244679390361, + "grad_norm": 0.8039339780807495, + "learning_rate": 6.141636141636141e-05, + "loss": 0.8609, + "step": 2227 + }, + { + "epoch": 2.4473431278319375, + "grad_norm": 0.6489125490188599, + "learning_rate": 6.129426129426128e-05, + "loss": 0.6309, + "step": 2228 + }, + { + "epoch": 2.4484415762735137, + "grad_norm": 0.533184826374054, + "learning_rate": 6.117216117216116e-05, + "loss": 0.5166, + "step": 2229 + }, + { + "epoch": 2.44954002471509, + "grad_norm": 0.5699225068092346, + "learning_rate": 6.105006105006105e-05, + "loss": 0.7276, + "step": 2230 + }, + { + "epoch": 2.4506384731566664, + "grad_norm": 0.5552012324333191, + "learning_rate": 6.092796092796092e-05, + "loss": 0.636, + "step": 2231 + }, + { + "epoch": 2.4517369215982425, + "grad_norm": 0.4785599112510681, + "learning_rate": 6.08058608058608e-05, + "loss": 0.6362, + "step": 2232 + }, + { + "epoch": 2.4528353700398187, + "grad_norm": 0.740872859954834, + "learning_rate": 6.068376068376068e-05, + "loss": 0.5603, + "step": 2233 + }, + { + "epoch": 2.453933818481395, + "grad_norm": 0.5217441916465759, + "learning_rate": 6.056166056166056e-05, + "loss": 0.6306, + "step": 2234 + }, + { + "epoch": 2.4550322669229714, + "grad_norm": 0.446481853723526, + "learning_rate": 6.043956043956044e-05, + "loss": 0.8156, + "step": 2235 + }, + { + "epoch": 2.4561307153645475, + "grad_norm": 0.6527410745620728, + "learning_rate": 6.031746031746031e-05, + "loss": 0.7057, + "step": 2236 + }, + { + "epoch": 2.4572291638061237, + "grad_norm": 0.6801958680152893, + "learning_rate": 6.019536019536019e-05, + "loss": 0.7718, + "step": 2237 + }, + { + "epoch": 2.4583276122477002, + "grad_norm": 1.0723007917404175, + "learning_rate": 6.007326007326007e-05, + "loss": 0.5552, + "step": 2238 + }, + { + "epoch": 2.4594260606892764, + "grad_norm": 0.4058208763599396, + "learning_rate": 5.9951159951159945e-05, + "loss": 0.5035, + "step": 2239 + }, + { + "epoch": 2.4605245091308525, + "grad_norm": 0.5384330153465271, + "learning_rate": 5.9829059829059824e-05, + "loss": 0.5059, + "step": 2240 + }, + { + "epoch": 2.461622957572429, + "grad_norm": 0.7797716856002808, + "learning_rate": 5.9706959706959696e-05, + "loss": 0.5613, + "step": 2241 + }, + { + "epoch": 2.4627214060140052, + "grad_norm": 2.9689226150512695, + "learning_rate": 5.958485958485958e-05, + "loss": 0.6219, + "step": 2242 + }, + { + "epoch": 2.4638198544555814, + "grad_norm": 0.47863152623176575, + "learning_rate": 5.946275946275946e-05, + "loss": 0.5498, + "step": 2243 + }, + { + "epoch": 2.464918302897158, + "grad_norm": 0.49707144498825073, + "learning_rate": 5.934065934065933e-05, + "loss": 0.775, + "step": 2244 + }, + { + "epoch": 2.466016751338734, + "grad_norm": 0.3437495529651642, + "learning_rate": 5.921855921855922e-05, + "loss": 0.4592, + "step": 2245 + }, + { + "epoch": 2.4671151997803102, + "grad_norm": 0.7298309206962585, + "learning_rate": 5.9096459096459096e-05, + "loss": 0.5374, + "step": 2246 + }, + { + "epoch": 2.4682136482218864, + "grad_norm": 0.6666691303253174, + "learning_rate": 5.897435897435897e-05, + "loss": 0.424, + "step": 2247 + }, + { + "epoch": 2.469312096663463, + "grad_norm": 0.5841661691665649, + "learning_rate": 5.8852258852258847e-05, + "loss": 0.5316, + "step": 2248 + }, + { + "epoch": 2.470410545105039, + "grad_norm": 0.4921081066131592, + "learning_rate": 5.873015873015872e-05, + "loss": 0.6901, + "step": 2249 + }, + { + "epoch": 2.4715089935466152, + "grad_norm": 0.4779987633228302, + "learning_rate": 5.8608058608058604e-05, + "loss": 0.8976, + "step": 2250 + }, + { + "epoch": 2.472607441988192, + "grad_norm": 0.43142780661582947, + "learning_rate": 5.848595848595848e-05, + "loss": 0.4915, + "step": 2251 + }, + { + "epoch": 2.473705890429768, + "grad_norm": 1.132870078086853, + "learning_rate": 5.8363858363858355e-05, + "loss": 0.6633, + "step": 2252 + }, + { + "epoch": 2.474804338871344, + "grad_norm": 0.5674893856048584, + "learning_rate": 5.824175824175824e-05, + "loss": 0.5023, + "step": 2253 + }, + { + "epoch": 2.4759027873129207, + "grad_norm": 0.42495957016944885, + "learning_rate": 5.811965811965811e-05, + "loss": 0.6544, + "step": 2254 + }, + { + "epoch": 2.477001235754497, + "grad_norm": 0.8031434416770935, + "learning_rate": 5.799755799755799e-05, + "loss": 0.892, + "step": 2255 + }, + { + "epoch": 2.478099684196073, + "grad_norm": 0.7715115547180176, + "learning_rate": 5.7875457875457876e-05, + "loss": 0.5659, + "step": 2256 + }, + { + "epoch": 2.4791981326376495, + "grad_norm": 0.6882114410400391, + "learning_rate": 5.775335775335775e-05, + "loss": 0.5154, + "step": 2257 + }, + { + "epoch": 2.4802965810792257, + "grad_norm": 0.4994114935398102, + "learning_rate": 5.763125763125763e-05, + "loss": 0.6001, + "step": 2258 + }, + { + "epoch": 2.481395029520802, + "grad_norm": 0.45008450746536255, + "learning_rate": 5.7509157509157506e-05, + "loss": 0.7076, + "step": 2259 + }, + { + "epoch": 2.482493477962378, + "grad_norm": 0.654270350933075, + "learning_rate": 5.738705738705738e-05, + "loss": 0.5809, + "step": 2260 + }, + { + "epoch": 2.4835919264039545, + "grad_norm": 0.6344896554946899, + "learning_rate": 5.726495726495726e-05, + "loss": 0.6059, + "step": 2261 + }, + { + "epoch": 2.4846903748455307, + "grad_norm": 0.44090238213539124, + "learning_rate": 5.7142857142857135e-05, + "loss": 0.7953, + "step": 2262 + }, + { + "epoch": 2.485788823287107, + "grad_norm": 0.47564128041267395, + "learning_rate": 5.7020757020757014e-05, + "loss": 0.5062, + "step": 2263 + }, + { + "epoch": 2.4868872717286834, + "grad_norm": 0.3644583225250244, + "learning_rate": 5.68986568986569e-05, + "loss": 0.6417, + "step": 2264 + }, + { + "epoch": 2.4879857201702595, + "grad_norm": 0.5264548659324646, + "learning_rate": 5.677655677655677e-05, + "loss": 0.5971, + "step": 2265 + }, + { + "epoch": 2.4890841686118357, + "grad_norm": 0.7300589680671692, + "learning_rate": 5.665445665445665e-05, + "loss": 0.6249, + "step": 2266 + }, + { + "epoch": 2.490182617053412, + "grad_norm": 0.9016311764717102, + "learning_rate": 5.653235653235652e-05, + "loss": 0.5761, + "step": 2267 + }, + { + "epoch": 2.4912810654949884, + "grad_norm": 0.7480237483978271, + "learning_rate": 5.641025641025641e-05, + "loss": 0.4026, + "step": 2268 + }, + { + "epoch": 2.4923795139365645, + "grad_norm": 0.5738864541053772, + "learning_rate": 5.6288156288156286e-05, + "loss": 0.8657, + "step": 2269 + }, + { + "epoch": 2.493477962378141, + "grad_norm": 0.7320820093154907, + "learning_rate": 5.616605616605616e-05, + "loss": 0.7341, + "step": 2270 + }, + { + "epoch": 2.4945764108197173, + "grad_norm": 0.7029497623443604, + "learning_rate": 5.6043956043956037e-05, + "loss": 0.7597, + "step": 2271 + }, + { + "epoch": 2.4956748592612934, + "grad_norm": 0.5160001516342163, + "learning_rate": 5.592185592185592e-05, + "loss": 0.6488, + "step": 2272 + }, + { + "epoch": 2.4967733077028695, + "grad_norm": 0.5425933003425598, + "learning_rate": 5.5799755799755794e-05, + "loss": 0.7102, + "step": 2273 + }, + { + "epoch": 2.497871756144446, + "grad_norm": 0.5881295204162598, + "learning_rate": 5.567765567765567e-05, + "loss": 0.8123, + "step": 2274 + }, + { + "epoch": 2.4989702045860223, + "grad_norm": 0.6021397113800049, + "learning_rate": 5.5555555555555545e-05, + "loss": 0.8887, + "step": 2275 + }, + { + "epoch": 2.5000686530275984, + "grad_norm": 0.4754411578178406, + "learning_rate": 5.543345543345543e-05, + "loss": 0.8162, + "step": 2276 + }, + { + "epoch": 2.501167101469175, + "grad_norm": 0.46976983547210693, + "learning_rate": 5.531135531135531e-05, + "loss": 0.4177, + "step": 2277 + }, + { + "epoch": 2.502265549910751, + "grad_norm": 0.4946482181549072, + "learning_rate": 5.518925518925518e-05, + "loss": 0.6997, + "step": 2278 + }, + { + "epoch": 2.5033639983523273, + "grad_norm": 0.49166280031204224, + "learning_rate": 5.5067155067155066e-05, + "loss": 0.6436, + "step": 2279 + }, + { + "epoch": 2.5044624467939034, + "grad_norm": 0.40157628059387207, + "learning_rate": 5.494505494505494e-05, + "loss": 0.6998, + "step": 2280 + }, + { + "epoch": 2.50556089523548, + "grad_norm": 0.4139937162399292, + "learning_rate": 5.482295482295482e-05, + "loss": 0.4021, + "step": 2281 + }, + { + "epoch": 2.506659343677056, + "grad_norm": 3.6814892292022705, + "learning_rate": 5.4700854700854696e-05, + "loss": 0.6402, + "step": 2282 + }, + { + "epoch": 2.5077577921186327, + "grad_norm": 0.3136257529258728, + "learning_rate": 5.4578754578754574e-05, + "loss": 0.5364, + "step": 2283 + }, + { + "epoch": 2.508856240560209, + "grad_norm": 0.42901432514190674, + "learning_rate": 5.445665445665445e-05, + "loss": 0.6838, + "step": 2284 + }, + { + "epoch": 2.509954689001785, + "grad_norm": 0.8462406992912292, + "learning_rate": 5.433455433455433e-05, + "loss": 0.4232, + "step": 2285 + }, + { + "epoch": 2.511053137443361, + "grad_norm": 1.244150996208191, + "learning_rate": 5.4212454212454204e-05, + "loss": 0.6192, + "step": 2286 + }, + { + "epoch": 2.5121515858849373, + "grad_norm": 0.834296703338623, + "learning_rate": 5.409035409035409e-05, + "loss": 0.548, + "step": 2287 + }, + { + "epoch": 2.513250034326514, + "grad_norm": 0.4279276430606842, + "learning_rate": 5.396825396825396e-05, + "loss": 0.7549, + "step": 2288 + }, + { + "epoch": 2.51434848276809, + "grad_norm": 0.5770757794380188, + "learning_rate": 5.384615384615384e-05, + "loss": 0.6156, + "step": 2289 + }, + { + "epoch": 2.5154469312096666, + "grad_norm": 0.41763821244239807, + "learning_rate": 5.3724053724053725e-05, + "loss": 0.5019, + "step": 2290 + }, + { + "epoch": 2.5165453796512427, + "grad_norm": 0.5212944746017456, + "learning_rate": 5.36019536019536e-05, + "loss": 0.6132, + "step": 2291 + }, + { + "epoch": 2.517643828092819, + "grad_norm": 0.44493457674980164, + "learning_rate": 5.3479853479853476e-05, + "loss": 0.4162, + "step": 2292 + }, + { + "epoch": 2.518742276534395, + "grad_norm": 0.46922022104263306, + "learning_rate": 5.335775335775335e-05, + "loss": 0.4624, + "step": 2293 + }, + { + "epoch": 2.5198407249759716, + "grad_norm": 0.41906213760375977, + "learning_rate": 5.3235653235653233e-05, + "loss": 0.612, + "step": 2294 + }, + { + "epoch": 2.5209391734175477, + "grad_norm": 0.620276153087616, + "learning_rate": 5.311355311355311e-05, + "loss": 0.6322, + "step": 2295 + }, + { + "epoch": 2.522037621859124, + "grad_norm": 0.6597051620483398, + "learning_rate": 5.2991452991452984e-05, + "loss": 0.7659, + "step": 2296 + }, + { + "epoch": 2.5231360703007004, + "grad_norm": 4.377660274505615, + "learning_rate": 5.286935286935286e-05, + "loss": 0.8294, + "step": 2297 + }, + { + "epoch": 2.5242345187422766, + "grad_norm": 0.6086331009864807, + "learning_rate": 5.274725274725275e-05, + "loss": 0.5164, + "step": 2298 + }, + { + "epoch": 2.5253329671838527, + "grad_norm": 0.5100352168083191, + "learning_rate": 5.262515262515262e-05, + "loss": 0.6319, + "step": 2299 + }, + { + "epoch": 2.526431415625429, + "grad_norm": 0.6642487049102783, + "learning_rate": 5.25030525030525e-05, + "loss": 0.533, + "step": 2300 + }, + { + "epoch": 2.5275298640670054, + "grad_norm": 0.5834927558898926, + "learning_rate": 5.238095238095237e-05, + "loss": 0.5669, + "step": 2301 + }, + { + "epoch": 2.5286283125085816, + "grad_norm": 0.530815064907074, + "learning_rate": 5.2258852258852256e-05, + "loss": 0.6189, + "step": 2302 + }, + { + "epoch": 2.529726760950158, + "grad_norm": 0.6275864243507385, + "learning_rate": 5.2136752136752135e-05, + "loss": 0.8403, + "step": 2303 + }, + { + "epoch": 2.5308252093917343, + "grad_norm": 0.5878366827964783, + "learning_rate": 5.201465201465201e-05, + "loss": 0.6176, + "step": 2304 + }, + { + "epoch": 2.5319236578333104, + "grad_norm": 0.37410980463027954, + "learning_rate": 5.189255189255189e-05, + "loss": 0.6337, + "step": 2305 + }, + { + "epoch": 2.5330221062748866, + "grad_norm": 0.43912917375564575, + "learning_rate": 5.1770451770451764e-05, + "loss": 0.5348, + "step": 2306 + }, + { + "epoch": 2.534120554716463, + "grad_norm": 1.4737471342086792, + "learning_rate": 5.164835164835164e-05, + "loss": 0.4862, + "step": 2307 + }, + { + "epoch": 2.5352190031580393, + "grad_norm": 0.3978705108165741, + "learning_rate": 5.152625152625152e-05, + "loss": 0.7929, + "step": 2308 + }, + { + "epoch": 2.5363174515996154, + "grad_norm": 0.3852058947086334, + "learning_rate": 5.14041514041514e-05, + "loss": 0.5895, + "step": 2309 + }, + { + "epoch": 2.537415900041192, + "grad_norm": 17.968448638916016, + "learning_rate": 5.128205128205128e-05, + "loss": 0.4661, + "step": 2310 + }, + { + "epoch": 2.538514348482768, + "grad_norm": 0.9369175434112549, + "learning_rate": 5.115995115995115e-05, + "loss": 0.5957, + "step": 2311 + }, + { + "epoch": 2.5396127969243443, + "grad_norm": 0.612750768661499, + "learning_rate": 5.103785103785103e-05, + "loss": 0.6786, + "step": 2312 + }, + { + "epoch": 2.5407112453659204, + "grad_norm": 0.588512659072876, + "learning_rate": 5.0915750915750915e-05, + "loss": 1.0482, + "step": 2313 + }, + { + "epoch": 2.541809693807497, + "grad_norm": 0.4964143633842468, + "learning_rate": 5.079365079365079e-05, + "loss": 0.5673, + "step": 2314 + }, + { + "epoch": 2.542908142249073, + "grad_norm": 0.5807982683181763, + "learning_rate": 5.0671550671550666e-05, + "loss": 0.5493, + "step": 2315 + }, + { + "epoch": 2.5440065906906497, + "grad_norm": 0.5131386518478394, + "learning_rate": 5.054945054945055e-05, + "loss": 0.5947, + "step": 2316 + }, + { + "epoch": 2.545105039132226, + "grad_norm": 0.4521124064922333, + "learning_rate": 5.0427350427350424e-05, + "loss": 0.5554, + "step": 2317 + }, + { + "epoch": 2.546203487573802, + "grad_norm": 0.9441378712654114, + "learning_rate": 5.03052503052503e-05, + "loss": 0.6991, + "step": 2318 + }, + { + "epoch": 2.547301936015378, + "grad_norm": 0.6353013515472412, + "learning_rate": 5.0183150183150174e-05, + "loss": 0.5308, + "step": 2319 + }, + { + "epoch": 2.5484003844569547, + "grad_norm": 0.5940631628036499, + "learning_rate": 5.006105006105006e-05, + "loss": 0.6536, + "step": 2320 + }, + { + "epoch": 2.549498832898531, + "grad_norm": 0.5457591414451599, + "learning_rate": 4.993894993894994e-05, + "loss": 0.6927, + "step": 2321 + }, + { + "epoch": 2.550597281340107, + "grad_norm": 0.6265937685966492, + "learning_rate": 4.981684981684981e-05, + "loss": 0.6341, + "step": 2322 + }, + { + "epoch": 2.5516957297816836, + "grad_norm": 0.5842925310134888, + "learning_rate": 4.969474969474969e-05, + "loss": 0.4583, + "step": 2323 + }, + { + "epoch": 2.5527941782232597, + "grad_norm": 0.5363351106643677, + "learning_rate": 4.957264957264956e-05, + "loss": 0.6882, + "step": 2324 + }, + { + "epoch": 2.553892626664836, + "grad_norm": 0.3677682876586914, + "learning_rate": 4.9450549450549446e-05, + "loss": 0.5671, + "step": 2325 + }, + { + "epoch": 2.554991075106412, + "grad_norm": 1.222985863685608, + "learning_rate": 4.9328449328449325e-05, + "loss": 0.4936, + "step": 2326 + }, + { + "epoch": 2.5560895235479886, + "grad_norm": 1.187898874282837, + "learning_rate": 4.92063492063492e-05, + "loss": 0.4893, + "step": 2327 + }, + { + "epoch": 2.5571879719895647, + "grad_norm": 0.38843801617622375, + "learning_rate": 4.908424908424908e-05, + "loss": 0.6512, + "step": 2328 + }, + { + "epoch": 2.558286420431141, + "grad_norm": 0.9550036191940308, + "learning_rate": 4.896214896214896e-05, + "loss": 0.6055, + "step": 2329 + }, + { + "epoch": 2.5593848688727174, + "grad_norm": 0.80762779712677, + "learning_rate": 4.884004884004883e-05, + "loss": 0.8852, + "step": 2330 + }, + { + "epoch": 2.5604833173142936, + "grad_norm": 0.7496643662452698, + "learning_rate": 4.871794871794872e-05, + "loss": 0.6535, + "step": 2331 + }, + { + "epoch": 2.5615817657558697, + "grad_norm": 0.5532578825950623, + "learning_rate": 4.859584859584859e-05, + "loss": 0.6336, + "step": 2332 + }, + { + "epoch": 2.562680214197446, + "grad_norm": 0.4058012366294861, + "learning_rate": 4.847374847374847e-05, + "loss": 0.6529, + "step": 2333 + }, + { + "epoch": 2.5637786626390224, + "grad_norm": 3.1913115978240967, + "learning_rate": 4.835164835164835e-05, + "loss": 0.548, + "step": 2334 + }, + { + "epoch": 2.5648771110805986, + "grad_norm": 0.47375988960266113, + "learning_rate": 4.822954822954822e-05, + "loss": 0.7567, + "step": 2335 + }, + { + "epoch": 2.565975559522175, + "grad_norm": 0.5287726521492004, + "learning_rate": 4.8107448107448106e-05, + "loss": 0.6009, + "step": 2336 + }, + { + "epoch": 2.5670740079637513, + "grad_norm": 0.43966931104660034, + "learning_rate": 4.798534798534798e-05, + "loss": 0.5538, + "step": 2337 + }, + { + "epoch": 2.5681724564053274, + "grad_norm": 0.6683239340782166, + "learning_rate": 4.7863247863247856e-05, + "loss": 0.3999, + "step": 2338 + }, + { + "epoch": 2.5692709048469036, + "grad_norm": 0.5260687470436096, + "learning_rate": 4.774114774114774e-05, + "loss": 0.7212, + "step": 2339 + }, + { + "epoch": 2.57036935328848, + "grad_norm": 1.086850881576538, + "learning_rate": 4.7619047619047614e-05, + "loss": 0.7439, + "step": 2340 + }, + { + "epoch": 2.5714678017300563, + "grad_norm": 0.9744517207145691, + "learning_rate": 4.749694749694749e-05, + "loss": 0.5625, + "step": 2341 + }, + { + "epoch": 2.5725662501716324, + "grad_norm": 0.6829352974891663, + "learning_rate": 4.737484737484738e-05, + "loss": 0.5241, + "step": 2342 + }, + { + "epoch": 2.573664698613209, + "grad_norm": 0.9441612958908081, + "learning_rate": 4.725274725274725e-05, + "loss": 0.8815, + "step": 2343 + }, + { + "epoch": 2.574763147054785, + "grad_norm": 0.9406607151031494, + "learning_rate": 4.713064713064713e-05, + "loss": 0.7176, + "step": 2344 + }, + { + "epoch": 2.5758615954963613, + "grad_norm": 0.6601364016532898, + "learning_rate": 4.7008547008547e-05, + "loss": 0.7713, + "step": 2345 + }, + { + "epoch": 2.5769600439379374, + "grad_norm": 2.5189599990844727, + "learning_rate": 4.688644688644688e-05, + "loss": 0.5572, + "step": 2346 + }, + { + "epoch": 2.578058492379514, + "grad_norm": 0.7295210957527161, + "learning_rate": 4.6764346764346765e-05, + "loss": 0.4431, + "step": 2347 + }, + { + "epoch": 2.57915694082109, + "grad_norm": 0.5053385496139526, + "learning_rate": 4.6642246642246637e-05, + "loss": 0.4881, + "step": 2348 + }, + { + "epoch": 2.5802553892626667, + "grad_norm": 0.6556063890457153, + "learning_rate": 4.6520146520146515e-05, + "loss": 0.5168, + "step": 2349 + }, + { + "epoch": 2.581353837704243, + "grad_norm": 0.37052014470100403, + "learning_rate": 4.639804639804639e-05, + "loss": 0.3954, + "step": 2350 + }, + { + "epoch": 2.582452286145819, + "grad_norm": 0.5975561738014221, + "learning_rate": 4.627594627594627e-05, + "loss": 0.5714, + "step": 2351 + }, + { + "epoch": 2.583550734587395, + "grad_norm": 0.7273014187812805, + "learning_rate": 4.615384615384615e-05, + "loss": 0.7287, + "step": 2352 + }, + { + "epoch": 2.5846491830289717, + "grad_norm": 0.566586971282959, + "learning_rate": 4.603174603174602e-05, + "loss": 0.5589, + "step": 2353 + }, + { + "epoch": 2.585747631470548, + "grad_norm": 0.5846517086029053, + "learning_rate": 4.590964590964591e-05, + "loss": 0.5061, + "step": 2354 + }, + { + "epoch": 2.586846079912124, + "grad_norm": 0.7470859885215759, + "learning_rate": 4.578754578754579e-05, + "loss": 0.5433, + "step": 2355 + }, + { + "epoch": 2.5879445283537006, + "grad_norm": 0.5419175624847412, + "learning_rate": 4.566544566544566e-05, + "loss": 0.5502, + "step": 2356 + }, + { + "epoch": 2.5890429767952767, + "grad_norm": 1.507851004600525, + "learning_rate": 4.554334554334554e-05, + "loss": 0.7399, + "step": 2357 + }, + { + "epoch": 2.590141425236853, + "grad_norm": 1.4420006275177002, + "learning_rate": 4.542124542124542e-05, + "loss": 0.4233, + "step": 2358 + }, + { + "epoch": 2.591239873678429, + "grad_norm": 0.6471789479255676, + "learning_rate": 4.5299145299145296e-05, + "loss": 0.4052, + "step": 2359 + }, + { + "epoch": 2.5923383221200056, + "grad_norm": 0.5886567831039429, + "learning_rate": 4.5177045177045174e-05, + "loss": 0.7197, + "step": 2360 + }, + { + "epoch": 2.5934367705615817, + "grad_norm": 0.843024492263794, + "learning_rate": 4.5054945054945046e-05, + "loss": 0.7636, + "step": 2361 + }, + { + "epoch": 2.5945352190031583, + "grad_norm": 0.8689064979553223, + "learning_rate": 4.493284493284493e-05, + "loss": 0.6694, + "step": 2362 + }, + { + "epoch": 2.5956336674447344, + "grad_norm": 0.5112485289573669, + "learning_rate": 4.4810744810744804e-05, + "loss": 0.5338, + "step": 2363 + }, + { + "epoch": 2.5967321158863106, + "grad_norm": 0.4828614294528961, + "learning_rate": 4.468864468864468e-05, + "loss": 0.8519, + "step": 2364 + }, + { + "epoch": 2.5978305643278867, + "grad_norm": 0.5644575357437134, + "learning_rate": 4.456654456654457e-05, + "loss": 0.5605, + "step": 2365 + }, + { + "epoch": 2.598929012769463, + "grad_norm": 0.7749584913253784, + "learning_rate": 4.444444444444444e-05, + "loss": 0.6697, + "step": 2366 + }, + { + "epoch": 2.6000274612110394, + "grad_norm": 0.9038271307945251, + "learning_rate": 4.432234432234432e-05, + "loss": 0.7242, + "step": 2367 + }, + { + "epoch": 2.6011259096526156, + "grad_norm": 0.5102944374084473, + "learning_rate": 4.42002442002442e-05, + "loss": 0.5841, + "step": 2368 + }, + { + "epoch": 2.602224358094192, + "grad_norm": 0.5072823762893677, + "learning_rate": 4.4078144078144076e-05, + "loss": 0.4927, + "step": 2369 + }, + { + "epoch": 2.6033228065357683, + "grad_norm": 0.3654184341430664, + "learning_rate": 4.3956043956043955e-05, + "loss": 0.6449, + "step": 2370 + }, + { + "epoch": 2.6044212549773444, + "grad_norm": 1.7309939861297607, + "learning_rate": 4.3833943833943827e-05, + "loss": 0.6979, + "step": 2371 + }, + { + "epoch": 2.6055197034189206, + "grad_norm": 0.7982075214385986, + "learning_rate": 4.3711843711843705e-05, + "loss": 0.6589, + "step": 2372 + }, + { + "epoch": 2.606618151860497, + "grad_norm": 0.6989462375640869, + "learning_rate": 4.358974358974359e-05, + "loss": 0.7104, + "step": 2373 + }, + { + "epoch": 2.6077166003020733, + "grad_norm": 0.7331676483154297, + "learning_rate": 4.346764346764346e-05, + "loss": 0.7565, + "step": 2374 + }, + { + "epoch": 2.6088150487436494, + "grad_norm": 1.0566400289535522, + "learning_rate": 4.334554334554334e-05, + "loss": 0.6967, + "step": 2375 + }, + { + "epoch": 2.609913497185226, + "grad_norm": 0.5988017320632935, + "learning_rate": 4.322344322344321e-05, + "loss": 0.7871, + "step": 2376 + }, + { + "epoch": 2.611011945626802, + "grad_norm": 0.4248102307319641, + "learning_rate": 4.31013431013431e-05, + "loss": 0.6891, + "step": 2377 + }, + { + "epoch": 2.6121103940683783, + "grad_norm": 1.9839611053466797, + "learning_rate": 4.297924297924298e-05, + "loss": 0.6647, + "step": 2378 + }, + { + "epoch": 2.6132088425099544, + "grad_norm": 0.4382665455341339, + "learning_rate": 4.285714285714285e-05, + "loss": 0.5969, + "step": 2379 + }, + { + "epoch": 2.614307290951531, + "grad_norm": 1.1918715238571167, + "learning_rate": 4.2735042735042735e-05, + "loss": 0.7788, + "step": 2380 + }, + { + "epoch": 2.615405739393107, + "grad_norm": 0.38117820024490356, + "learning_rate": 4.2612942612942614e-05, + "loss": 0.4967, + "step": 2381 + }, + { + "epoch": 2.6165041878346837, + "grad_norm": 0.6454489827156067, + "learning_rate": 4.2490842490842486e-05, + "loss": 0.7724, + "step": 2382 + }, + { + "epoch": 2.61760263627626, + "grad_norm": 1.0696319341659546, + "learning_rate": 4.2368742368742364e-05, + "loss": 0.5292, + "step": 2383 + }, + { + "epoch": 2.618701084717836, + "grad_norm": 0.5887579321861267, + "learning_rate": 4.224664224664224e-05, + "loss": 0.5317, + "step": 2384 + }, + { + "epoch": 2.619799533159412, + "grad_norm": 0.557188093662262, + "learning_rate": 4.212454212454212e-05, + "loss": 0.7172, + "step": 2385 + }, + { + "epoch": 2.6208979816009887, + "grad_norm": 0.5122195482254028, + "learning_rate": 4.2002442002442e-05, + "loss": 0.6398, + "step": 2386 + }, + { + "epoch": 2.621996430042565, + "grad_norm": 0.520722508430481, + "learning_rate": 4.188034188034187e-05, + "loss": 0.3984, + "step": 2387 + }, + { + "epoch": 2.623094878484141, + "grad_norm": 1.2077422142028809, + "learning_rate": 4.175824175824176e-05, + "loss": 0.6686, + "step": 2388 + }, + { + "epoch": 2.6241933269257176, + "grad_norm": 1.1437829732894897, + "learning_rate": 4.163614163614163e-05, + "loss": 0.6653, + "step": 2389 + }, + { + "epoch": 2.6252917753672937, + "grad_norm": 0.6157158017158508, + "learning_rate": 4.151404151404151e-05, + "loss": 0.7074, + "step": 2390 + }, + { + "epoch": 2.62639022380887, + "grad_norm": 1.8944931030273438, + "learning_rate": 4.1391941391941394e-05, + "loss": 0.5991, + "step": 2391 + }, + { + "epoch": 2.627488672250446, + "grad_norm": 0.6598528623580933, + "learning_rate": 4.1269841269841266e-05, + "loss": 0.6051, + "step": 2392 + }, + { + "epoch": 2.6285871206920226, + "grad_norm": 0.9341129660606384, + "learning_rate": 4.1147741147741145e-05, + "loss": 0.3795, + "step": 2393 + }, + { + "epoch": 2.6296855691335987, + "grad_norm": 0.4246079921722412, + "learning_rate": 4.1025641025641023e-05, + "loss": 0.4603, + "step": 2394 + }, + { + "epoch": 2.6307840175751753, + "grad_norm": 0.6639881134033203, + "learning_rate": 4.09035409035409e-05, + "loss": 0.5862, + "step": 2395 + }, + { + "epoch": 2.6318824660167515, + "grad_norm": 1.297917366027832, + "learning_rate": 4.078144078144078e-05, + "loss": 0.6175, + "step": 2396 + }, + { + "epoch": 2.6329809144583276, + "grad_norm": 0.7880698442459106, + "learning_rate": 4.065934065934065e-05, + "loss": 0.7034, + "step": 2397 + }, + { + "epoch": 2.6340793628999037, + "grad_norm": 0.6197066903114319, + "learning_rate": 4.053724053724053e-05, + "loss": 0.659, + "step": 2398 + }, + { + "epoch": 2.6351778113414803, + "grad_norm": 0.7560408711433411, + "learning_rate": 4.041514041514042e-05, + "loss": 0.5543, + "step": 2399 + }, + { + "epoch": 2.6362762597830565, + "grad_norm": 2.2571635246276855, + "learning_rate": 4.029304029304029e-05, + "loss": 0.712, + "step": 2400 + }, + { + "epoch": 2.6373747082246326, + "grad_norm": 0.8119613528251648, + "learning_rate": 4.017094017094017e-05, + "loss": 0.6407, + "step": 2401 + }, + { + "epoch": 2.638473156666209, + "grad_norm": 3.9773592948913574, + "learning_rate": 4.004884004884004e-05, + "loss": 0.6434, + "step": 2402 + }, + { + "epoch": 2.6395716051077853, + "grad_norm": 1.2648125886917114, + "learning_rate": 3.9926739926739925e-05, + "loss": 0.689, + "step": 2403 + }, + { + "epoch": 2.6406700535493615, + "grad_norm": 0.7015364170074463, + "learning_rate": 3.9804639804639804e-05, + "loss": 0.4175, + "step": 2404 + }, + { + "epoch": 2.6417685019909376, + "grad_norm": 0.941303551197052, + "learning_rate": 3.9682539682539676e-05, + "loss": 0.4126, + "step": 2405 + }, + { + "epoch": 2.642866950432514, + "grad_norm": 0.7533726096153259, + "learning_rate": 3.956043956043956e-05, + "loss": 0.7401, + "step": 2406 + }, + { + "epoch": 2.6439653988740903, + "grad_norm": 0.5480525493621826, + "learning_rate": 3.943833943833943e-05, + "loss": 0.5567, + "step": 2407 + }, + { + "epoch": 2.6450638473156665, + "grad_norm": 0.6171422004699707, + "learning_rate": 3.931623931623931e-05, + "loss": 0.721, + "step": 2408 + }, + { + "epoch": 2.646162295757243, + "grad_norm": 0.6719728708267212, + "learning_rate": 3.919413919413919e-05, + "loss": 0.5015, + "step": 2409 + }, + { + "epoch": 2.647260744198819, + "grad_norm": 1.8106555938720703, + "learning_rate": 3.907203907203906e-05, + "loss": 0.6954, + "step": 2410 + }, + { + "epoch": 2.6483591926403953, + "grad_norm": 0.42534878849983215, + "learning_rate": 3.894993894993895e-05, + "loss": 0.5241, + "step": 2411 + }, + { + "epoch": 2.6494576410819715, + "grad_norm": 0.8733202219009399, + "learning_rate": 3.882783882783883e-05, + "loss": 0.4485, + "step": 2412 + }, + { + "epoch": 2.650556089523548, + "grad_norm": 0.9050257802009583, + "learning_rate": 3.87057387057387e-05, + "loss": 0.6202, + "step": 2413 + }, + { + "epoch": 2.651654537965124, + "grad_norm": 0.650347888469696, + "learning_rate": 3.8583638583638584e-05, + "loss": 0.621, + "step": 2414 + }, + { + "epoch": 2.6527529864067008, + "grad_norm": 6.092042446136475, + "learning_rate": 3.8461538461538456e-05, + "loss": 0.5143, + "step": 2415 + }, + { + "epoch": 2.653851434848277, + "grad_norm": 0.7801241874694824, + "learning_rate": 3.8339438339438335e-05, + "loss": 0.5424, + "step": 2416 + }, + { + "epoch": 2.654949883289853, + "grad_norm": 0.5492686629295349, + "learning_rate": 3.821733821733822e-05, + "loss": 0.642, + "step": 2417 + }, + { + "epoch": 2.656048331731429, + "grad_norm": 0.4257514774799347, + "learning_rate": 3.809523809523809e-05, + "loss": 0.8273, + "step": 2418 + }, + { + "epoch": 2.6571467801730058, + "grad_norm": 1.0180964469909668, + "learning_rate": 3.797313797313797e-05, + "loss": 0.6962, + "step": 2419 + }, + { + "epoch": 2.658245228614582, + "grad_norm": 0.3844882547855377, + "learning_rate": 3.785103785103784e-05, + "loss": 0.7315, + "step": 2420 + }, + { + "epoch": 2.659343677056158, + "grad_norm": 0.46182385087013245, + "learning_rate": 3.772893772893772e-05, + "loss": 0.3889, + "step": 2421 + }, + { + "epoch": 2.6604421254977346, + "grad_norm": 0.562627375125885, + "learning_rate": 3.760683760683761e-05, + "loss": 0.6415, + "step": 2422 + }, + { + "epoch": 2.6615405739393108, + "grad_norm": 0.3234645128250122, + "learning_rate": 3.7484737484737486e-05, + "loss": 0.4819, + "step": 2423 + }, + { + "epoch": 2.662639022380887, + "grad_norm": 0.6804205775260925, + "learning_rate": 3.736263736263736e-05, + "loss": 0.4248, + "step": 2424 + }, + { + "epoch": 2.663737470822463, + "grad_norm": 0.5543864369392395, + "learning_rate": 3.7240537240537236e-05, + "loss": 0.5259, + "step": 2425 + }, + { + "epoch": 2.6648359192640396, + "grad_norm": 0.8411497473716736, + "learning_rate": 3.7118437118437115e-05, + "loss": 0.5448, + "step": 2426 + }, + { + "epoch": 2.6659343677056158, + "grad_norm": 0.4386245608329773, + "learning_rate": 3.6996336996336994e-05, + "loss": 0.9601, + "step": 2427 + }, + { + "epoch": 2.6670328161471923, + "grad_norm": 0.773210346698761, + "learning_rate": 3.687423687423687e-05, + "loss": 0.8601, + "step": 2428 + }, + { + "epoch": 2.6681312645887685, + "grad_norm": 0.4636232852935791, + "learning_rate": 3.675213675213675e-05, + "loss": 0.6322, + "step": 2429 + }, + { + "epoch": 2.6692297130303446, + "grad_norm": 1.6318496465682983, + "learning_rate": 3.663003663003662e-05, + "loss": 0.4402, + "step": 2430 + }, + { + "epoch": 2.6703281614719208, + "grad_norm": 0.5299782156944275, + "learning_rate": 3.65079365079365e-05, + "loss": 0.5622, + "step": 2431 + }, + { + "epoch": 2.6714266099134973, + "grad_norm": 1.1223825216293335, + "learning_rate": 3.638583638583638e-05, + "loss": 0.5994, + "step": 2432 + }, + { + "epoch": 2.6725250583550735, + "grad_norm": 1.8495402336120605, + "learning_rate": 3.626373626373626e-05, + "loss": 0.669, + "step": 2433 + }, + { + "epoch": 2.6736235067966496, + "grad_norm": 0.4963383972644806, + "learning_rate": 3.614163614163614e-05, + "loss": 0.5412, + "step": 2434 + }, + { + "epoch": 2.674721955238226, + "grad_norm": 0.5644822716712952, + "learning_rate": 3.601953601953602e-05, + "loss": 0.5768, + "step": 2435 + }, + { + "epoch": 2.6758204036798023, + "grad_norm": 0.5272318720817566, + "learning_rate": 3.5897435897435896e-05, + "loss": 0.5909, + "step": 2436 + }, + { + "epoch": 2.6769188521213785, + "grad_norm": 0.29838863015174866, + "learning_rate": 3.5775335775335774e-05, + "loss": 0.5625, + "step": 2437 + }, + { + "epoch": 2.6780173005629546, + "grad_norm": 0.5375344157218933, + "learning_rate": 3.565323565323565e-05, + "loss": 0.5932, + "step": 2438 + }, + { + "epoch": 2.679115749004531, + "grad_norm": 0.7850833535194397, + "learning_rate": 3.5531135531135525e-05, + "loss": 0.6706, + "step": 2439 + }, + { + "epoch": 2.6802141974461073, + "grad_norm": 0.5286651253700256, + "learning_rate": 3.540903540903541e-05, + "loss": 0.6865, + "step": 2440 + }, + { + "epoch": 2.681312645887684, + "grad_norm": 0.9832364320755005, + "learning_rate": 3.528693528693528e-05, + "loss": 0.7941, + "step": 2441 + }, + { + "epoch": 2.68241109432926, + "grad_norm": 0.4431805908679962, + "learning_rate": 3.516483516483516e-05, + "loss": 0.4706, + "step": 2442 + }, + { + "epoch": 2.683509542770836, + "grad_norm": 1.7264482975006104, + "learning_rate": 3.504273504273504e-05, + "loss": 0.6308, + "step": 2443 + }, + { + "epoch": 2.6846079912124123, + "grad_norm": 0.6196084022521973, + "learning_rate": 3.492063492063492e-05, + "loss": 1.0233, + "step": 2444 + }, + { + "epoch": 2.6857064396539885, + "grad_norm": 0.855876088142395, + "learning_rate": 3.47985347985348e-05, + "loss": 0.5522, + "step": 2445 + }, + { + "epoch": 2.686804888095565, + "grad_norm": 0.45323798060417175, + "learning_rate": 3.4676434676434676e-05, + "loss": 0.6232, + "step": 2446 + }, + { + "epoch": 2.687903336537141, + "grad_norm": 0.577273964881897, + "learning_rate": 3.455433455433455e-05, + "loss": 0.5051, + "step": 2447 + }, + { + "epoch": 2.689001784978718, + "grad_norm": 0.4999620020389557, + "learning_rate": 3.4432234432234427e-05, + "loss": 0.4881, + "step": 2448 + }, + { + "epoch": 2.690100233420294, + "grad_norm": 0.5028046369552612, + "learning_rate": 3.431013431013431e-05, + "loss": 0.6575, + "step": 2449 + }, + { + "epoch": 2.69119868186187, + "grad_norm": 2.122028350830078, + "learning_rate": 3.4188034188034184e-05, + "loss": 0.7226, + "step": 2450 + }, + { + "epoch": 2.692297130303446, + "grad_norm": 0.4979703426361084, + "learning_rate": 3.406593406593406e-05, + "loss": 0.5768, + "step": 2451 + }, + { + "epoch": 2.693395578745023, + "grad_norm": 0.9270527958869934, + "learning_rate": 3.394383394383394e-05, + "loss": 0.6464, + "step": 2452 + }, + { + "epoch": 2.694494027186599, + "grad_norm": 1.0739809274673462, + "learning_rate": 3.382173382173382e-05, + "loss": 0.753, + "step": 2453 + }, + { + "epoch": 2.695592475628175, + "grad_norm": 0.6039335131645203, + "learning_rate": 3.36996336996337e-05, + "loss": 0.7909, + "step": 2454 + }, + { + "epoch": 2.6966909240697516, + "grad_norm": 0.49040424823760986, + "learning_rate": 3.357753357753358e-05, + "loss": 0.6112, + "step": 2455 + }, + { + "epoch": 2.6977893725113278, + "grad_norm": 0.6890440583229065, + "learning_rate": 3.345543345543345e-05, + "loss": 0.6849, + "step": 2456 + }, + { + "epoch": 2.698887820952904, + "grad_norm": 0.7819212675094604, + "learning_rate": 3.333333333333333e-05, + "loss": 0.6797, + "step": 2457 + }, + { + "epoch": 2.69998626939448, + "grad_norm": 1.0147050619125366, + "learning_rate": 3.321123321123321e-05, + "loss": 0.6867, + "step": 2458 + }, + { + "epoch": 2.7010847178360566, + "grad_norm": 1.3562036752700806, + "learning_rate": 3.3089133089133086e-05, + "loss": 0.7811, + "step": 2459 + }, + { + "epoch": 2.7021831662776328, + "grad_norm": 0.5813838839530945, + "learning_rate": 3.2967032967032964e-05, + "loss": 0.5405, + "step": 2460 + }, + { + "epoch": 2.7032816147192094, + "grad_norm": 0.6152640581130981, + "learning_rate": 3.284493284493284e-05, + "loss": 0.425, + "step": 2461 + }, + { + "epoch": 2.7043800631607855, + "grad_norm": 1.1984590291976929, + "learning_rate": 3.272283272283272e-05, + "loss": 0.592, + "step": 2462 + }, + { + "epoch": 2.7054785116023616, + "grad_norm": 0.48487693071365356, + "learning_rate": 3.26007326007326e-05, + "loss": 0.5223, + "step": 2463 + }, + { + "epoch": 2.7065769600439378, + "grad_norm": 0.47191065549850464, + "learning_rate": 3.247863247863247e-05, + "loss": 0.6479, + "step": 2464 + }, + { + "epoch": 2.7076754084855144, + "grad_norm": 1.3167297840118408, + "learning_rate": 3.235653235653235e-05, + "loss": 0.4552, + "step": 2465 + }, + { + "epoch": 2.7087738569270905, + "grad_norm": 1.3219714164733887, + "learning_rate": 3.2234432234432237e-05, + "loss": 0.5839, + "step": 2466 + }, + { + "epoch": 2.7098723053686666, + "grad_norm": 0.8047394752502441, + "learning_rate": 3.211233211233211e-05, + "loss": 0.795, + "step": 2467 + }, + { + "epoch": 2.710970753810243, + "grad_norm": 0.6053475737571716, + "learning_rate": 3.199023199023199e-05, + "loss": 0.743, + "step": 2468 + }, + { + "epoch": 2.7120692022518194, + "grad_norm": 0.4619985818862915, + "learning_rate": 3.1868131868131866e-05, + "loss": 0.642, + "step": 2469 + }, + { + "epoch": 2.7131676506933955, + "grad_norm": 0.8241426944732666, + "learning_rate": 3.1746031746031745e-05, + "loss": 0.521, + "step": 2470 + }, + { + "epoch": 2.7142660991349716, + "grad_norm": 0.4344565272331238, + "learning_rate": 3.162393162393162e-05, + "loss": 0.4615, + "step": 2471 + }, + { + "epoch": 2.715364547576548, + "grad_norm": 0.9640605449676514, + "learning_rate": 3.15018315018315e-05, + "loss": 0.4735, + "step": 2472 + }, + { + "epoch": 2.7164629960181244, + "grad_norm": 0.49423810839653015, + "learning_rate": 3.1379731379731374e-05, + "loss": 0.7547, + "step": 2473 + }, + { + "epoch": 2.717561444459701, + "grad_norm": 0.7234408855438232, + "learning_rate": 3.125763125763125e-05, + "loss": 0.464, + "step": 2474 + }, + { + "epoch": 2.718659892901277, + "grad_norm": 0.542647123336792, + "learning_rate": 3.113553113553113e-05, + "loss": 0.5563, + "step": 2475 + }, + { + "epoch": 2.719758341342853, + "grad_norm": 0.555722177028656, + "learning_rate": 3.101343101343101e-05, + "loss": 0.6899, + "step": 2476 + }, + { + "epoch": 2.7208567897844294, + "grad_norm": 0.6171600222587585, + "learning_rate": 3.089133089133089e-05, + "loss": 0.6088, + "step": 2477 + }, + { + "epoch": 2.7219552382260055, + "grad_norm": 0.9118738770484924, + "learning_rate": 3.076923076923077e-05, + "loss": 0.7778, + "step": 2478 + }, + { + "epoch": 2.723053686667582, + "grad_norm": 0.6610655784606934, + "learning_rate": 3.064713064713064e-05, + "loss": 0.6935, + "step": 2479 + }, + { + "epoch": 2.724152135109158, + "grad_norm": 0.6729289889335632, + "learning_rate": 3.0525030525030525e-05, + "loss": 0.792, + "step": 2480 + }, + { + "epoch": 2.725250583550735, + "grad_norm": 0.4955647587776184, + "learning_rate": 3.04029304029304e-05, + "loss": 0.6746, + "step": 2481 + }, + { + "epoch": 2.726349031992311, + "grad_norm": 0.42975953221321106, + "learning_rate": 3.028083028083028e-05, + "loss": 0.5318, + "step": 2482 + }, + { + "epoch": 2.727447480433887, + "grad_norm": 0.3555055856704712, + "learning_rate": 3.0158730158730154e-05, + "loss": 0.6377, + "step": 2483 + }, + { + "epoch": 2.728545928875463, + "grad_norm": 3.138209342956543, + "learning_rate": 3.0036630036630036e-05, + "loss": 0.6296, + "step": 2484 + }, + { + "epoch": 2.72964437731704, + "grad_norm": 0.5710242390632629, + "learning_rate": 2.9914529914529912e-05, + "loss": 0.8987, + "step": 2485 + }, + { + "epoch": 2.730742825758616, + "grad_norm": 0.5200769305229187, + "learning_rate": 2.979242979242979e-05, + "loss": 0.5154, + "step": 2486 + }, + { + "epoch": 2.731841274200192, + "grad_norm": 0.797572910785675, + "learning_rate": 2.9670329670329666e-05, + "loss": 0.8039, + "step": 2487 + }, + { + "epoch": 2.7329397226417687, + "grad_norm": 0.4667447805404663, + "learning_rate": 2.9548229548229548e-05, + "loss": 0.586, + "step": 2488 + }, + { + "epoch": 2.734038171083345, + "grad_norm": 0.5500869154930115, + "learning_rate": 2.9426129426129423e-05, + "loss": 0.7007, + "step": 2489 + }, + { + "epoch": 2.735136619524921, + "grad_norm": 0.5311625003814697, + "learning_rate": 2.9304029304029302e-05, + "loss": 0.4257, + "step": 2490 + }, + { + "epoch": 2.736235067966497, + "grad_norm": 0.6474941968917847, + "learning_rate": 2.9181929181929177e-05, + "loss": 0.4747, + "step": 2491 + }, + { + "epoch": 2.7373335164080737, + "grad_norm": 1.1186646223068237, + "learning_rate": 2.9059829059829056e-05, + "loss": 0.8177, + "step": 2492 + }, + { + "epoch": 2.73843196484965, + "grad_norm": 2.455371379852295, + "learning_rate": 2.8937728937728938e-05, + "loss": 0.6535, + "step": 2493 + }, + { + "epoch": 2.7395304132912264, + "grad_norm": 0.5033484101295471, + "learning_rate": 2.8815628815628813e-05, + "loss": 0.525, + "step": 2494 + }, + { + "epoch": 2.7406288617328025, + "grad_norm": 0.5826357007026672, + "learning_rate": 2.869352869352869e-05, + "loss": 0.476, + "step": 2495 + }, + { + "epoch": 2.7417273101743787, + "grad_norm": 0.5875104665756226, + "learning_rate": 2.8571428571428567e-05, + "loss": 0.6903, + "step": 2496 + }, + { + "epoch": 2.742825758615955, + "grad_norm": 0.6006028056144714, + "learning_rate": 2.844932844932845e-05, + "loss": 0.8522, + "step": 2497 + }, + { + "epoch": 2.7439242070575314, + "grad_norm": 0.5605003833770752, + "learning_rate": 2.8327228327228325e-05, + "loss": 0.5312, + "step": 2498 + }, + { + "epoch": 2.7450226554991075, + "grad_norm": 0.7641153931617737, + "learning_rate": 2.8205128205128204e-05, + "loss": 0.6841, + "step": 2499 + }, + { + "epoch": 2.7461211039406836, + "grad_norm": 0.5523414015769958, + "learning_rate": 2.808302808302808e-05, + "loss": 0.6582, + "step": 2500 + }, + { + "epoch": 2.7472195523822602, + "grad_norm": 0.40714672207832336, + "learning_rate": 2.796092796092796e-05, + "loss": 0.7493, + "step": 2501 + }, + { + "epoch": 2.7483180008238364, + "grad_norm": 0.6960926651954651, + "learning_rate": 2.7838827838827836e-05, + "loss": 0.7104, + "step": 2502 + }, + { + "epoch": 2.7494164492654125, + "grad_norm": 0.42409783601760864, + "learning_rate": 2.7716727716727715e-05, + "loss": 0.5643, + "step": 2503 + }, + { + "epoch": 2.7505148977069886, + "grad_norm": 0.5174455046653748, + "learning_rate": 2.759462759462759e-05, + "loss": 0.4545, + "step": 2504 + }, + { + "epoch": 2.7516133461485652, + "grad_norm": 0.6353528499603271, + "learning_rate": 2.747252747252747e-05, + "loss": 0.5068, + "step": 2505 + }, + { + "epoch": 2.7527117945901414, + "grad_norm": 0.46814125776290894, + "learning_rate": 2.7350427350427348e-05, + "loss": 0.7979, + "step": 2506 + }, + { + "epoch": 2.753810243031718, + "grad_norm": 0.7229417562484741, + "learning_rate": 2.7228327228327227e-05, + "loss": 0.6212, + "step": 2507 + }, + { + "epoch": 2.754908691473294, + "grad_norm": 1.2155603170394897, + "learning_rate": 2.7106227106227102e-05, + "loss": 0.8444, + "step": 2508 + }, + { + "epoch": 2.7560071399148702, + "grad_norm": 0.462703138589859, + "learning_rate": 2.698412698412698e-05, + "loss": 0.8263, + "step": 2509 + }, + { + "epoch": 2.7571055883564464, + "grad_norm": 0.9474642872810364, + "learning_rate": 2.6862026862026863e-05, + "loss": 0.7586, + "step": 2510 + }, + { + "epoch": 2.758204036798023, + "grad_norm": 4.502622127532959, + "learning_rate": 2.6739926739926738e-05, + "loss": 0.5806, + "step": 2511 + }, + { + "epoch": 2.759302485239599, + "grad_norm": 1.1251213550567627, + "learning_rate": 2.6617826617826617e-05, + "loss": 0.6333, + "step": 2512 + }, + { + "epoch": 2.7604009336811752, + "grad_norm": 0.7035579681396484, + "learning_rate": 2.6495726495726492e-05, + "loss": 0.4739, + "step": 2513 + }, + { + "epoch": 2.761499382122752, + "grad_norm": 0.5279493927955627, + "learning_rate": 2.6373626373626374e-05, + "loss": 0.597, + "step": 2514 + }, + { + "epoch": 2.762597830564328, + "grad_norm": 0.5512554049491882, + "learning_rate": 2.625152625152625e-05, + "loss": 0.6471, + "step": 2515 + }, + { + "epoch": 2.763696279005904, + "grad_norm": 0.857778012752533, + "learning_rate": 2.6129426129426128e-05, + "loss": 0.6172, + "step": 2516 + }, + { + "epoch": 2.7647947274474802, + "grad_norm": 0.5348466634750366, + "learning_rate": 2.6007326007326004e-05, + "loss": 0.8074, + "step": 2517 + }, + { + "epoch": 2.765893175889057, + "grad_norm": 0.5413629412651062, + "learning_rate": 2.5885225885225882e-05, + "loss": 0.3879, + "step": 2518 + }, + { + "epoch": 2.766991624330633, + "grad_norm": 0.569411039352417, + "learning_rate": 2.576312576312576e-05, + "loss": 0.4392, + "step": 2519 + }, + { + "epoch": 2.7680900727722095, + "grad_norm": 0.5127429962158203, + "learning_rate": 2.564102564102564e-05, + "loss": 0.6566, + "step": 2520 + }, + { + "epoch": 2.7691885212137857, + "grad_norm": 0.7328614592552185, + "learning_rate": 2.5518925518925515e-05, + "loss": 0.6801, + "step": 2521 + }, + { + "epoch": 2.770286969655362, + "grad_norm": 0.615686297416687, + "learning_rate": 2.5396825396825394e-05, + "loss": 0.6366, + "step": 2522 + }, + { + "epoch": 2.771385418096938, + "grad_norm": 0.5250161290168762, + "learning_rate": 2.5274725274725276e-05, + "loss": 0.5737, + "step": 2523 + }, + { + "epoch": 2.772483866538514, + "grad_norm": 0.6708832383155823, + "learning_rate": 2.515262515262515e-05, + "loss": 0.6681, + "step": 2524 + }, + { + "epoch": 2.7735823149800907, + "grad_norm": 0.6120278835296631, + "learning_rate": 2.503052503052503e-05, + "loss": 0.4964, + "step": 2525 + }, + { + "epoch": 2.774680763421667, + "grad_norm": 0.7024976015090942, + "learning_rate": 2.4908424908424905e-05, + "loss": 0.7984, + "step": 2526 + }, + { + "epoch": 2.7757792118632434, + "grad_norm": 7.281716823577881, + "learning_rate": 2.478632478632478e-05, + "loss": 0.7191, + "step": 2527 + }, + { + "epoch": 2.7768776603048195, + "grad_norm": 0.7347024083137512, + "learning_rate": 2.4664224664224663e-05, + "loss": 0.8684, + "step": 2528 + }, + { + "epoch": 2.7779761087463957, + "grad_norm": 1.1338274478912354, + "learning_rate": 2.454212454212454e-05, + "loss": 0.5936, + "step": 2529 + }, + { + "epoch": 2.779074557187972, + "grad_norm": 0.4176536202430725, + "learning_rate": 2.4420024420024417e-05, + "loss": 0.445, + "step": 2530 + }, + { + "epoch": 2.7801730056295484, + "grad_norm": 0.9390072822570801, + "learning_rate": 2.4297924297924295e-05, + "loss": 0.5821, + "step": 2531 + }, + { + "epoch": 2.7812714540711245, + "grad_norm": 1.1045840978622437, + "learning_rate": 2.4175824175824174e-05, + "loss": 0.7372, + "step": 2532 + }, + { + "epoch": 2.7823699025127007, + "grad_norm": 0.5568689703941345, + "learning_rate": 2.4053724053724053e-05, + "loss": 0.5005, + "step": 2533 + }, + { + "epoch": 2.7834683509542772, + "grad_norm": 0.2747582793235779, + "learning_rate": 2.3931623931623928e-05, + "loss": 0.5778, + "step": 2534 + }, + { + "epoch": 2.7845667993958534, + "grad_norm": 1.4027804136276245, + "learning_rate": 2.3809523809523807e-05, + "loss": 0.5368, + "step": 2535 + }, + { + "epoch": 2.7856652478374295, + "grad_norm": 0.7523220777511597, + "learning_rate": 2.368742368742369e-05, + "loss": 0.58, + "step": 2536 + }, + { + "epoch": 2.7867636962790057, + "grad_norm": 0.33777353167533875, + "learning_rate": 2.3565323565323564e-05, + "loss": 0.5269, + "step": 2537 + }, + { + "epoch": 2.7878621447205822, + "grad_norm": 0.5818787217140198, + "learning_rate": 2.344322344322344e-05, + "loss": 0.4459, + "step": 2538 + }, + { + "epoch": 2.7889605931621584, + "grad_norm": 0.36858034133911133, + "learning_rate": 2.3321123321123318e-05, + "loss": 0.712, + "step": 2539 + }, + { + "epoch": 2.790059041603735, + "grad_norm": 0.5299241542816162, + "learning_rate": 2.3199023199023194e-05, + "loss": 0.6086, + "step": 2540 + }, + { + "epoch": 2.791157490045311, + "grad_norm": 2.432325601577759, + "learning_rate": 2.3076923076923076e-05, + "loss": 1.0386, + "step": 2541 + }, + { + "epoch": 2.7922559384868872, + "grad_norm": 0.746638834476471, + "learning_rate": 2.2954822954822954e-05, + "loss": 0.7372, + "step": 2542 + }, + { + "epoch": 2.7933543869284634, + "grad_norm": 0.6017647981643677, + "learning_rate": 2.283272283272283e-05, + "loss": 0.9134, + "step": 2543 + }, + { + "epoch": 2.79445283537004, + "grad_norm": 0.7385385036468506, + "learning_rate": 2.271062271062271e-05, + "loss": 0.6827, + "step": 2544 + }, + { + "epoch": 2.795551283811616, + "grad_norm": 0.6607246994972229, + "learning_rate": 2.2588522588522587e-05, + "loss": 0.6333, + "step": 2545 + }, + { + "epoch": 2.7966497322531922, + "grad_norm": 0.40185117721557617, + "learning_rate": 2.2466422466422466e-05, + "loss": 0.6589, + "step": 2546 + }, + { + "epoch": 2.797748180694769, + "grad_norm": 0.48225662112236023, + "learning_rate": 2.234432234432234e-05, + "loss": 0.6571, + "step": 2547 + }, + { + "epoch": 2.798846629136345, + "grad_norm": 0.8996065855026245, + "learning_rate": 2.222222222222222e-05, + "loss": 0.7518, + "step": 2548 + }, + { + "epoch": 2.799945077577921, + "grad_norm": 0.7139112949371338, + "learning_rate": 2.21001221001221e-05, + "loss": 0.6517, + "step": 2549 + }, + { + "epoch": 2.8010435260194972, + "grad_norm": 0.5433416366577148, + "learning_rate": 2.1978021978021977e-05, + "loss": 0.3799, + "step": 2550 + }, + { + "epoch": 2.802141974461074, + "grad_norm": 0.3883088231086731, + "learning_rate": 2.1855921855921853e-05, + "loss": 0.9269, + "step": 2551 + }, + { + "epoch": 2.80324042290265, + "grad_norm": 0.5275357961654663, + "learning_rate": 2.173382173382173e-05, + "loss": 0.6606, + "step": 2552 + }, + { + "epoch": 2.8043388713442265, + "grad_norm": 0.4666341543197632, + "learning_rate": 2.1611721611721607e-05, + "loss": 0.6982, + "step": 2553 + }, + { + "epoch": 2.8054373197858027, + "grad_norm": 0.9221529364585876, + "learning_rate": 2.148962148962149e-05, + "loss": 0.4769, + "step": 2554 + }, + { + "epoch": 2.806535768227379, + "grad_norm": 0.7469640374183655, + "learning_rate": 2.1367521367521368e-05, + "loss": 0.6985, + "step": 2555 + }, + { + "epoch": 2.807634216668955, + "grad_norm": 0.6858775615692139, + "learning_rate": 2.1245421245421243e-05, + "loss": 0.4511, + "step": 2556 + }, + { + "epoch": 2.808732665110531, + "grad_norm": 1.266801357269287, + "learning_rate": 2.112332112332112e-05, + "loss": 0.421, + "step": 2557 + }, + { + "epoch": 2.8098311135521077, + "grad_norm": 0.5506262183189392, + "learning_rate": 2.1001221001221e-05, + "loss": 0.6082, + "step": 2558 + }, + { + "epoch": 2.810929561993684, + "grad_norm": 0.5359029173851013, + "learning_rate": 2.087912087912088e-05, + "loss": 0.8111, + "step": 2559 + }, + { + "epoch": 2.8120280104352604, + "grad_norm": 0.6969206929206848, + "learning_rate": 2.0757020757020754e-05, + "loss": 0.8331, + "step": 2560 + }, + { + "epoch": 2.8131264588768365, + "grad_norm": 0.6040379405021667, + "learning_rate": 2.0634920634920633e-05, + "loss": 0.575, + "step": 2561 + }, + { + "epoch": 2.8142249073184127, + "grad_norm": 1.3847273588180542, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.5442, + "step": 2562 + }, + { + "epoch": 2.815323355759989, + "grad_norm": 0.8050490617752075, + "learning_rate": 2.039072039072039e-05, + "loss": 0.6267, + "step": 2563 + }, + { + "epoch": 2.8164218042015654, + "grad_norm": 0.5663136839866638, + "learning_rate": 2.0268620268620266e-05, + "loss": 0.5246, + "step": 2564 + }, + { + "epoch": 2.8175202526431415, + "grad_norm": 0.3316130042076111, + "learning_rate": 2.0146520146520144e-05, + "loss": 0.5175, + "step": 2565 + }, + { + "epoch": 2.8186187010847177, + "grad_norm": 0.4782855808734894, + "learning_rate": 2.002442002442002e-05, + "loss": 0.5111, + "step": 2566 + }, + { + "epoch": 2.8197171495262943, + "grad_norm": 0.44766396284103394, + "learning_rate": 1.9902319902319902e-05, + "loss": 0.5825, + "step": 2567 + }, + { + "epoch": 2.8208155979678704, + "grad_norm": 0.6830618977546692, + "learning_rate": 1.978021978021978e-05, + "loss": 0.5685, + "step": 2568 + }, + { + "epoch": 2.8219140464094465, + "grad_norm": 0.5860748887062073, + "learning_rate": 1.9658119658119656e-05, + "loss": 0.7557, + "step": 2569 + }, + { + "epoch": 2.8230124948510227, + "grad_norm": 0.49533459544181824, + "learning_rate": 1.953601953601953e-05, + "loss": 0.7326, + "step": 2570 + }, + { + "epoch": 2.8241109432925993, + "grad_norm": 0.4989941418170929, + "learning_rate": 1.9413919413919413e-05, + "loss": 0.5757, + "step": 2571 + }, + { + "epoch": 2.8252093917341754, + "grad_norm": 0.4973461627960205, + "learning_rate": 1.9291819291819292e-05, + "loss": 0.5357, + "step": 2572 + }, + { + "epoch": 2.826307840175752, + "grad_norm": 0.7442370057106018, + "learning_rate": 1.9169719169719167e-05, + "loss": 0.7283, + "step": 2573 + }, + { + "epoch": 2.827406288617328, + "grad_norm": 1.3321865797042847, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.5107, + "step": 2574 + }, + { + "epoch": 2.8285047370589043, + "grad_norm": 0.47394871711730957, + "learning_rate": 1.892551892551892e-05, + "loss": 0.5495, + "step": 2575 + }, + { + "epoch": 2.8296031855004804, + "grad_norm": 0.6102151274681091, + "learning_rate": 1.8803418803418804e-05, + "loss": 0.5983, + "step": 2576 + }, + { + "epoch": 2.830701633942057, + "grad_norm": 0.4657471179962158, + "learning_rate": 1.868131868131868e-05, + "loss": 0.5937, + "step": 2577 + }, + { + "epoch": 2.831800082383633, + "grad_norm": 0.41180238127708435, + "learning_rate": 1.8559218559218558e-05, + "loss": 0.7775, + "step": 2578 + }, + { + "epoch": 2.8328985308252093, + "grad_norm": 3.5043845176696777, + "learning_rate": 1.8437118437118436e-05, + "loss": 0.5304, + "step": 2579 + }, + { + "epoch": 2.833996979266786, + "grad_norm": 0.4502231776714325, + "learning_rate": 1.831501831501831e-05, + "loss": 0.6556, + "step": 2580 + }, + { + "epoch": 2.835095427708362, + "grad_norm": 0.6165898442268372, + "learning_rate": 1.819291819291819e-05, + "loss": 0.8434, + "step": 2581 + }, + { + "epoch": 2.836193876149938, + "grad_norm": 0.5112649202346802, + "learning_rate": 1.807081807081807e-05, + "loss": 0.7429, + "step": 2582 + }, + { + "epoch": 2.8372923245915143, + "grad_norm": 0.4834790527820587, + "learning_rate": 1.7948717948717948e-05, + "loss": 0.5772, + "step": 2583 + }, + { + "epoch": 2.838390773033091, + "grad_norm": 0.4251219630241394, + "learning_rate": 1.7826617826617826e-05, + "loss": 0.5192, + "step": 2584 + }, + { + "epoch": 2.839489221474667, + "grad_norm": 0.7645363807678223, + "learning_rate": 1.7704517704517705e-05, + "loss": 0.6624, + "step": 2585 + }, + { + "epoch": 2.8405876699162436, + "grad_norm": 0.5651314854621887, + "learning_rate": 1.758241758241758e-05, + "loss": 0.5829, + "step": 2586 + }, + { + "epoch": 2.8416861183578197, + "grad_norm": 1.059164047241211, + "learning_rate": 1.746031746031746e-05, + "loss": 0.6688, + "step": 2587 + }, + { + "epoch": 2.842784566799396, + "grad_norm": 2.2424001693725586, + "learning_rate": 1.7338217338217338e-05, + "loss": 0.4515, + "step": 2588 + }, + { + "epoch": 2.843883015240972, + "grad_norm": 0.6211466789245605, + "learning_rate": 1.7216117216117213e-05, + "loss": 0.836, + "step": 2589 + }, + { + "epoch": 2.8449814636825486, + "grad_norm": 0.4224345088005066, + "learning_rate": 1.7094017094017092e-05, + "loss": 0.536, + "step": 2590 + }, + { + "epoch": 2.8460799121241247, + "grad_norm": 0.7985780239105225, + "learning_rate": 1.697191697191697e-05, + "loss": 0.7433, + "step": 2591 + }, + { + "epoch": 2.847178360565701, + "grad_norm": 1.4033039808273315, + "learning_rate": 1.684981684981685e-05, + "loss": 0.7479, + "step": 2592 + }, + { + "epoch": 2.8482768090072774, + "grad_norm": 1.1432255506515503, + "learning_rate": 1.6727716727716725e-05, + "loss": 0.652, + "step": 2593 + }, + { + "epoch": 2.8493752574488536, + "grad_norm": 0.9324535727500916, + "learning_rate": 1.6605616605616603e-05, + "loss": 0.5225, + "step": 2594 + }, + { + "epoch": 2.8504737058904297, + "grad_norm": 0.5573447942733765, + "learning_rate": 1.6483516483516482e-05, + "loss": 0.6649, + "step": 2595 + }, + { + "epoch": 2.851572154332006, + "grad_norm": 0.6875207424163818, + "learning_rate": 1.636141636141636e-05, + "loss": 0.7334, + "step": 2596 + }, + { + "epoch": 2.8526706027735824, + "grad_norm": 0.32099124789237976, + "learning_rate": 1.6239316239316236e-05, + "loss": 0.5732, + "step": 2597 + }, + { + "epoch": 2.8537690512151586, + "grad_norm": 0.4142940938472748, + "learning_rate": 1.6117216117216118e-05, + "loss": 0.6605, + "step": 2598 + }, + { + "epoch": 2.8548674996567347, + "grad_norm": 0.5377205610275269, + "learning_rate": 1.5995115995115994e-05, + "loss": 0.5556, + "step": 2599 + }, + { + "epoch": 2.8559659480983113, + "grad_norm": 0.43509960174560547, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.8321, + "step": 2600 + }, + { + "epoch": 2.8570643965398874, + "grad_norm": 0.4376494586467743, + "learning_rate": 1.575091575091575e-05, + "loss": 0.6392, + "step": 2601 + }, + { + "epoch": 2.8581628449814636, + "grad_norm": 0.507837176322937, + "learning_rate": 1.5628815628815626e-05, + "loss": 0.5326, + "step": 2602 + }, + { + "epoch": 2.8592612934230397, + "grad_norm": 29.0502986907959, + "learning_rate": 1.5506715506715505e-05, + "loss": 0.5478, + "step": 2603 + }, + { + "epoch": 2.8603597418646163, + "grad_norm": 0.6940420866012573, + "learning_rate": 1.5384615384615384e-05, + "loss": 1.3063, + "step": 2604 + }, + { + "epoch": 2.8614581903061924, + "grad_norm": 0.7178813219070435, + "learning_rate": 1.5262515262515263e-05, + "loss": 0.7447, + "step": 2605 + }, + { + "epoch": 2.862556638747769, + "grad_norm": 0.6209506392478943, + "learning_rate": 1.514041514041514e-05, + "loss": 0.5496, + "step": 2606 + }, + { + "epoch": 2.863655087189345, + "grad_norm": 0.5526819825172424, + "learning_rate": 1.5018315018315018e-05, + "loss": 0.4224, + "step": 2607 + }, + { + "epoch": 2.8647535356309213, + "grad_norm": 0.5056405663490295, + "learning_rate": 1.4896214896214895e-05, + "loss": 0.6248, + "step": 2608 + }, + { + "epoch": 2.8658519840724974, + "grad_norm": 2.416952610015869, + "learning_rate": 1.4774114774114774e-05, + "loss": 0.7551, + "step": 2609 + }, + { + "epoch": 2.866950432514074, + "grad_norm": 0.52223140001297, + "learning_rate": 1.4652014652014651e-05, + "loss": 1.1146, + "step": 2610 + }, + { + "epoch": 2.86804888095565, + "grad_norm": 0.685767650604248, + "learning_rate": 1.4529914529914528e-05, + "loss": 0.715, + "step": 2611 + }, + { + "epoch": 2.8691473293972263, + "grad_norm": 0.650374174118042, + "learning_rate": 1.4407814407814407e-05, + "loss": 0.8844, + "step": 2612 + }, + { + "epoch": 2.870245777838803, + "grad_norm": 0.46946465969085693, + "learning_rate": 1.4285714285714284e-05, + "loss": 0.9545, + "step": 2613 + }, + { + "epoch": 2.871344226280379, + "grad_norm": 0.5312052369117737, + "learning_rate": 1.4163614163614162e-05, + "loss": 0.5204, + "step": 2614 + }, + { + "epoch": 2.872442674721955, + "grad_norm": 0.41921889781951904, + "learning_rate": 1.404151404151404e-05, + "loss": 0.4614, + "step": 2615 + }, + { + "epoch": 2.8735411231635313, + "grad_norm": 0.513203501701355, + "learning_rate": 1.3919413919413918e-05, + "loss": 0.613, + "step": 2616 + }, + { + "epoch": 2.874639571605108, + "grad_norm": 1.1020901203155518, + "learning_rate": 1.3797313797313795e-05, + "loss": 0.525, + "step": 2617 + }, + { + "epoch": 2.875738020046684, + "grad_norm": 0.39301392436027527, + "learning_rate": 1.3675213675213674e-05, + "loss": 0.5799, + "step": 2618 + }, + { + "epoch": 2.8768364684882606, + "grad_norm": 1.576910376548767, + "learning_rate": 1.3553113553113551e-05, + "loss": 0.6286, + "step": 2619 + }, + { + "epoch": 2.8779349169298367, + "grad_norm": 0.36711424589157104, + "learning_rate": 1.3431013431013431e-05, + "loss": 0.7542, + "step": 2620 + }, + { + "epoch": 2.879033365371413, + "grad_norm": 1.2777636051177979, + "learning_rate": 1.3308913308913308e-05, + "loss": 0.6269, + "step": 2621 + }, + { + "epoch": 2.880131813812989, + "grad_norm": 0.5584180355072021, + "learning_rate": 1.3186813186813187e-05, + "loss": 0.5633, + "step": 2622 + }, + { + "epoch": 2.8812302622545656, + "grad_norm": 1.2418673038482666, + "learning_rate": 1.3064713064713064e-05, + "loss": 0.537, + "step": 2623 + }, + { + "epoch": 2.8823287106961417, + "grad_norm": 0.5850531458854675, + "learning_rate": 1.2942612942612941e-05, + "loss": 0.595, + "step": 2624 + }, + { + "epoch": 2.883427159137718, + "grad_norm": 1.054592251777649, + "learning_rate": 1.282051282051282e-05, + "loss": 0.8308, + "step": 2625 + }, + { + "epoch": 2.8845256075792944, + "grad_norm": 0.3231412470340729, + "learning_rate": 1.2698412698412697e-05, + "loss": 0.4044, + "step": 2626 + }, + { + "epoch": 2.8856240560208706, + "grad_norm": 0.47942933440208435, + "learning_rate": 1.2576312576312576e-05, + "loss": 0.6299, + "step": 2627 + }, + { + "epoch": 2.8867225044624467, + "grad_norm": 0.4884187579154968, + "learning_rate": 1.2454212454212453e-05, + "loss": 0.6606, + "step": 2628 + }, + { + "epoch": 2.887820952904023, + "grad_norm": 0.6658734083175659, + "learning_rate": 1.2332112332112331e-05, + "loss": 0.642, + "step": 2629 + }, + { + "epoch": 2.8889194013455994, + "grad_norm": 0.24990247189998627, + "learning_rate": 1.2210012210012208e-05, + "loss": 0.4041, + "step": 2630 + }, + { + "epoch": 2.8900178497871756, + "grad_norm": 0.6446508169174194, + "learning_rate": 1.2087912087912087e-05, + "loss": 0.7126, + "step": 2631 + }, + { + "epoch": 2.891116298228752, + "grad_norm": 0.7800988554954529, + "learning_rate": 1.1965811965811964e-05, + "loss": 0.6733, + "step": 2632 + }, + { + "epoch": 2.8922147466703283, + "grad_norm": 0.5319482684135437, + "learning_rate": 1.1843711843711844e-05, + "loss": 0.6445, + "step": 2633 + }, + { + "epoch": 2.8933131951119044, + "grad_norm": 0.6029678583145142, + "learning_rate": 1.172161172161172e-05, + "loss": 0.7642, + "step": 2634 + }, + { + "epoch": 2.8944116435534806, + "grad_norm": 0.9029693007469177, + "learning_rate": 1.1599511599511597e-05, + "loss": 0.635, + "step": 2635 + }, + { + "epoch": 2.8955100919950567, + "grad_norm": 0.6022691130638123, + "learning_rate": 1.1477411477411477e-05, + "loss": 0.5361, + "step": 2636 + }, + { + "epoch": 2.8966085404366333, + "grad_norm": 0.6777801513671875, + "learning_rate": 1.1355311355311354e-05, + "loss": 0.5099, + "step": 2637 + }, + { + "epoch": 2.8977069888782094, + "grad_norm": 0.4157528877258301, + "learning_rate": 1.1233211233211233e-05, + "loss": 0.5038, + "step": 2638 + }, + { + "epoch": 2.898805437319786, + "grad_norm": 2.6101133823394775, + "learning_rate": 1.111111111111111e-05, + "loss": 0.6324, + "step": 2639 + }, + { + "epoch": 2.899903885761362, + "grad_norm": 0.6885612607002258, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.4931, + "step": 2640 + }, + { + "epoch": 2.9010023342029383, + "grad_norm": 0.5510079264640808, + "learning_rate": 1.0866910866910866e-05, + "loss": 0.5088, + "step": 2641 + }, + { + "epoch": 2.9021007826445144, + "grad_norm": 0.6099854111671448, + "learning_rate": 1.0744810744810744e-05, + "loss": 0.4647, + "step": 2642 + }, + { + "epoch": 2.903199231086091, + "grad_norm": 0.4390881657600403, + "learning_rate": 1.0622710622710621e-05, + "loss": 0.6787, + "step": 2643 + }, + { + "epoch": 2.904297679527667, + "grad_norm": 0.46238628029823303, + "learning_rate": 1.05006105006105e-05, + "loss": 0.5655, + "step": 2644 + }, + { + "epoch": 2.9053961279692433, + "grad_norm": 0.479106605052948, + "learning_rate": 1.0378510378510377e-05, + "loss": 0.7833, + "step": 2645 + }, + { + "epoch": 2.90649457641082, + "grad_norm": 0.4643683135509491, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.4563, + "step": 2646 + }, + { + "epoch": 2.907593024852396, + "grad_norm": 0.4173976480960846, + "learning_rate": 1.0134310134310133e-05, + "loss": 0.6614, + "step": 2647 + }, + { + "epoch": 2.908691473293972, + "grad_norm": 0.7158990502357483, + "learning_rate": 1.001221001221001e-05, + "loss": 0.7342, + "step": 2648 + }, + { + "epoch": 2.9097899217355483, + "grad_norm": 0.7276301980018616, + "learning_rate": 9.89010989010989e-06, + "loss": 0.6883, + "step": 2649 + }, + { + "epoch": 2.910888370177125, + "grad_norm": 0.63588947057724, + "learning_rate": 9.768009768009766e-06, + "loss": 0.7533, + "step": 2650 + }, + { + "epoch": 2.911986818618701, + "grad_norm": 1.8038127422332764, + "learning_rate": 9.645909645909646e-06, + "loss": 0.6238, + "step": 2651 + }, + { + "epoch": 2.9130852670602776, + "grad_norm": 0.7289617657661438, + "learning_rate": 9.523809523809523e-06, + "loss": 0.4767, + "step": 2652 + }, + { + "epoch": 2.9141837155018537, + "grad_norm": 0.3828502893447876, + "learning_rate": 9.401709401709402e-06, + "loss": 0.4812, + "step": 2653 + }, + { + "epoch": 2.91528216394343, + "grad_norm": 0.5157826542854309, + "learning_rate": 9.279609279609279e-06, + "loss": 0.703, + "step": 2654 + }, + { + "epoch": 2.916380612385006, + "grad_norm": 0.6833345890045166, + "learning_rate": 9.157509157509156e-06, + "loss": 0.7471, + "step": 2655 + }, + { + "epoch": 2.9174790608265826, + "grad_norm": 1.0189886093139648, + "learning_rate": 9.035409035409035e-06, + "loss": 0.6065, + "step": 2656 + }, + { + "epoch": 2.9185775092681587, + "grad_norm": 0.5197221040725708, + "learning_rate": 8.913308913308913e-06, + "loss": 0.5904, + "step": 2657 + }, + { + "epoch": 2.919675957709735, + "grad_norm": 0.6265780925750732, + "learning_rate": 8.79120879120879e-06, + "loss": 0.5622, + "step": 2658 + }, + { + "epoch": 2.9207744061513115, + "grad_norm": 0.5703533887863159, + "learning_rate": 8.669108669108669e-06, + "loss": 0.8005, + "step": 2659 + }, + { + "epoch": 2.9218728545928876, + "grad_norm": 0.8656613230705261, + "learning_rate": 8.547008547008546e-06, + "loss": 0.4942, + "step": 2660 + }, + { + "epoch": 2.9229713030344637, + "grad_norm": 0.6180423498153687, + "learning_rate": 8.424908424908425e-06, + "loss": 0.8163, + "step": 2661 + }, + { + "epoch": 2.92406975147604, + "grad_norm": 0.7308143377304077, + "learning_rate": 8.302808302808302e-06, + "loss": 0.7639, + "step": 2662 + }, + { + "epoch": 2.9251681999176165, + "grad_norm": 0.585617184638977, + "learning_rate": 8.18070818070818e-06, + "loss": 0.7614, + "step": 2663 + }, + { + "epoch": 2.9262666483591926, + "grad_norm": 0.5277345776557922, + "learning_rate": 8.058608058608059e-06, + "loss": 0.6489, + "step": 2664 + }, + { + "epoch": 2.927365096800769, + "grad_norm": 0.3540293574333191, + "learning_rate": 7.936507936507936e-06, + "loss": 0.4503, + "step": 2665 + }, + { + "epoch": 2.9284635452423453, + "grad_norm": 0.554492175579071, + "learning_rate": 7.814407814407813e-06, + "loss": 0.5785, + "step": 2666 + }, + { + "epoch": 2.9295619936839215, + "grad_norm": 0.5547875761985779, + "learning_rate": 7.692307692307692e-06, + "loss": 0.5763, + "step": 2667 + }, + { + "epoch": 2.9306604421254976, + "grad_norm": 0.745947003364563, + "learning_rate": 7.57020757020757e-06, + "loss": 0.512, + "step": 2668 + }, + { + "epoch": 2.931758890567074, + "grad_norm": 0.47691571712493896, + "learning_rate": 7.448107448107448e-06, + "loss": 0.7018, + "step": 2669 + }, + { + "epoch": 2.9328573390086503, + "grad_norm": 0.9611607789993286, + "learning_rate": 7.3260073260073255e-06, + "loss": 0.7419, + "step": 2670 + }, + { + "epoch": 2.9339557874502264, + "grad_norm": 0.5495268106460571, + "learning_rate": 7.203907203907203e-06, + "loss": 0.6096, + "step": 2671 + }, + { + "epoch": 2.935054235891803, + "grad_norm": 0.8863226771354675, + "learning_rate": 7.081807081807081e-06, + "loss": 0.7149, + "step": 2672 + }, + { + "epoch": 2.936152684333379, + "grad_norm": 0.4234665334224701, + "learning_rate": 6.959706959706959e-06, + "loss": 0.6913, + "step": 2673 + }, + { + "epoch": 2.9372511327749553, + "grad_norm": 0.9667326211929321, + "learning_rate": 6.837606837606837e-06, + "loss": 0.4181, + "step": 2674 + }, + { + "epoch": 2.9383495812165314, + "grad_norm": 0.543683648109436, + "learning_rate": 6.715506715506716e-06, + "loss": 0.6329, + "step": 2675 + }, + { + "epoch": 2.939448029658108, + "grad_norm": 0.5083779692649841, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.8742, + "step": 2676 + }, + { + "epoch": 2.940546478099684, + "grad_norm": 0.7212001085281372, + "learning_rate": 6.4713064713064706e-06, + "loss": 0.6912, + "step": 2677 + }, + { + "epoch": 2.9416449265412603, + "grad_norm": 0.9474835991859436, + "learning_rate": 6.349206349206348e-06, + "loss": 0.649, + "step": 2678 + }, + { + "epoch": 2.942743374982837, + "grad_norm": 0.8142021298408508, + "learning_rate": 6.227106227106226e-06, + "loss": 0.6136, + "step": 2679 + }, + { + "epoch": 2.943841823424413, + "grad_norm": 2.9018187522888184, + "learning_rate": 6.105006105006104e-06, + "loss": 0.7157, + "step": 2680 + }, + { + "epoch": 2.944940271865989, + "grad_norm": 0.4023605287075043, + "learning_rate": 5.982905982905982e-06, + "loss": 0.5675, + "step": 2681 + }, + { + "epoch": 2.9460387203075653, + "grad_norm": 0.3693840801715851, + "learning_rate": 5.86080586080586e-06, + "loss": 0.5982, + "step": 2682 + }, + { + "epoch": 2.947137168749142, + "grad_norm": 0.4298234283924103, + "learning_rate": 5.738705738705739e-06, + "loss": 0.5379, + "step": 2683 + }, + { + "epoch": 2.948235617190718, + "grad_norm": 0.6495395302772522, + "learning_rate": 5.6166056166056165e-06, + "loss": 0.5411, + "step": 2684 + }, + { + "epoch": 2.9493340656322946, + "grad_norm": 0.44857510924339294, + "learning_rate": 5.494505494505494e-06, + "loss": 0.5154, + "step": 2685 + }, + { + "epoch": 2.9504325140738707, + "grad_norm": 0.7485830187797546, + "learning_rate": 5.372405372405372e-06, + "loss": 0.6595, + "step": 2686 + }, + { + "epoch": 2.951530962515447, + "grad_norm": 0.5141469836235046, + "learning_rate": 5.25030525030525e-06, + "loss": 0.6289, + "step": 2687 + }, + { + "epoch": 2.952629410957023, + "grad_norm": 0.8847435712814331, + "learning_rate": 5.128205128205128e-06, + "loss": 0.6734, + "step": 2688 + }, + { + "epoch": 2.9537278593985996, + "grad_norm": 0.570573091506958, + "learning_rate": 5.006105006105005e-06, + "loss": 0.7013, + "step": 2689 + }, + { + "epoch": 2.9548263078401757, + "grad_norm": 0.4376991391181946, + "learning_rate": 4.884004884004883e-06, + "loss": 0.5918, + "step": 2690 + }, + { + "epoch": 2.955924756281752, + "grad_norm": 0.5480318069458008, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.6227, + "step": 2691 + }, + { + "epoch": 2.9570232047233285, + "grad_norm": 0.5831297636032104, + "learning_rate": 4.639804639804639e-06, + "loss": 0.6264, + "step": 2692 + }, + { + "epoch": 2.9581216531649046, + "grad_norm": 1.5778921842575073, + "learning_rate": 4.517704517704517e-06, + "loss": 0.6352, + "step": 2693 + }, + { + "epoch": 2.9592201016064807, + "grad_norm": 0.9567496180534363, + "learning_rate": 4.395604395604395e-06, + "loss": 0.6067, + "step": 2694 + }, + { + "epoch": 2.960318550048057, + "grad_norm": 0.5237869620323181, + "learning_rate": 4.273504273504273e-06, + "loss": 0.8241, + "step": 2695 + }, + { + "epoch": 2.9614169984896335, + "grad_norm": 0.3452164828777313, + "learning_rate": 4.151404151404151e-06, + "loss": 0.5718, + "step": 2696 + }, + { + "epoch": 2.9625154469312096, + "grad_norm": 0.42237767577171326, + "learning_rate": 4.0293040293040296e-06, + "loss": 0.5199, + "step": 2697 + }, + { + "epoch": 2.963613895372786, + "grad_norm": 0.7035055756568909, + "learning_rate": 3.907203907203907e-06, + "loss": 0.7078, + "step": 2698 + }, + { + "epoch": 2.9647123438143623, + "grad_norm": 0.39236482977867126, + "learning_rate": 3.785103785103785e-06, + "loss": 0.59, + "step": 2699 + }, + { + "epoch": 2.9658107922559385, + "grad_norm": 1.1658680438995361, + "learning_rate": 3.6630036630036627e-06, + "loss": 0.53, + "step": 2700 + }, + { + "epoch": 2.9669092406975146, + "grad_norm": 0.6797634363174438, + "learning_rate": 3.5409035409035406e-06, + "loss": 0.6763, + "step": 2701 + }, + { + "epoch": 2.968007689139091, + "grad_norm": 1.0421425104141235, + "learning_rate": 3.4188034188034185e-06, + "loss": 0.4, + "step": 2702 + }, + { + "epoch": 2.9691061375806673, + "grad_norm": 0.36937475204467773, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.5401, + "step": 2703 + }, + { + "epoch": 2.9702045860222435, + "grad_norm": 0.4324638843536377, + "learning_rate": 3.174603174603174e-06, + "loss": 0.5882, + "step": 2704 + }, + { + "epoch": 2.97130303446382, + "grad_norm": 1.2700526714324951, + "learning_rate": 3.052503052503052e-06, + "loss": 0.613, + "step": 2705 + }, + { + "epoch": 2.972401482905396, + "grad_norm": 0.5261131525039673, + "learning_rate": 2.93040293040293e-06, + "loss": 0.6279, + "step": 2706 + }, + { + "epoch": 2.9734999313469723, + "grad_norm": 0.42924660444259644, + "learning_rate": 2.8083028083028082e-06, + "loss": 1.0058, + "step": 2707 + }, + { + "epoch": 2.9745983797885485, + "grad_norm": 3.100399971008301, + "learning_rate": 2.686202686202686e-06, + "loss": 0.5209, + "step": 2708 + }, + { + "epoch": 2.975696828230125, + "grad_norm": 0.3666403293609619, + "learning_rate": 2.564102564102564e-06, + "loss": 0.5231, + "step": 2709 + }, + { + "epoch": 2.976795276671701, + "grad_norm": 1.1315009593963623, + "learning_rate": 2.4420024420024414e-06, + "loss": 0.4449, + "step": 2710 + }, + { + "epoch": 2.9778937251132778, + "grad_norm": 0.3323412537574768, + "learning_rate": 2.3199023199023197e-06, + "loss": 0.4806, + "step": 2711 + }, + { + "epoch": 2.978992173554854, + "grad_norm": 0.7348967790603638, + "learning_rate": 2.1978021978021976e-06, + "loss": 0.7521, + "step": 2712 + }, + { + "epoch": 2.98009062199643, + "grad_norm": 1.018898606300354, + "learning_rate": 2.0757020757020754e-06, + "loss": 0.8468, + "step": 2713 + }, + { + "epoch": 2.981189070438006, + "grad_norm": 0.46808505058288574, + "learning_rate": 1.9536019536019533e-06, + "loss": 0.6992, + "step": 2714 + }, + { + "epoch": 2.9822875188795823, + "grad_norm": 0.5411276817321777, + "learning_rate": 1.8315018315018314e-06, + "loss": 0.5949, + "step": 2715 + }, + { + "epoch": 2.983385967321159, + "grad_norm": 0.45061302185058594, + "learning_rate": 1.7094017094017092e-06, + "loss": 0.4617, + "step": 2716 + }, + { + "epoch": 2.984484415762735, + "grad_norm": 0.44529294967651367, + "learning_rate": 1.587301587301587e-06, + "loss": 0.5811, + "step": 2717 + }, + { + "epoch": 2.9855828642043116, + "grad_norm": 1.255299687385559, + "learning_rate": 1.465201465201465e-06, + "loss": 1.1899, + "step": 2718 + }, + { + "epoch": 2.9866813126458878, + "grad_norm": 0.8325234651565552, + "learning_rate": 1.343101343101343e-06, + "loss": 0.6344, + "step": 2719 + }, + { + "epoch": 2.987779761087464, + "grad_norm": 1.0692095756530762, + "learning_rate": 1.2210012210012207e-06, + "loss": 0.5136, + "step": 2720 + }, + { + "epoch": 2.98887820952904, + "grad_norm": 0.4980855882167816, + "learning_rate": 1.0989010989010988e-06, + "loss": 0.6352, + "step": 2721 + }, + { + "epoch": 2.9899766579706166, + "grad_norm": 0.8502411246299744, + "learning_rate": 9.768009768009766e-07, + "loss": 0.599, + "step": 2722 + }, + { + "epoch": 2.9910751064121928, + "grad_norm": 0.4849570691585541, + "learning_rate": 8.547008547008546e-07, + "loss": 0.5862, + "step": 2723 + }, + { + "epoch": 2.992173554853769, + "grad_norm": 0.5491626858711243, + "learning_rate": 7.326007326007325e-07, + "loss": 0.5634, + "step": 2724 + }, + { + "epoch": 2.9932720032953455, + "grad_norm": 0.7289263606071472, + "learning_rate": 6.105006105006104e-07, + "loss": 0.6643, + "step": 2725 + }, + { + "epoch": 2.9943704517369216, + "grad_norm": 1.5343972444534302, + "learning_rate": 4.884004884004883e-07, + "loss": 0.71, + "step": 2726 + }, + { + "epoch": 2.9954689001784978, + "grad_norm": 0.5619814395904541, + "learning_rate": 3.6630036630036624e-07, + "loss": 0.721, + "step": 2727 + }, + { + "epoch": 2.996567348620074, + "grad_norm": 0.500442624092102, + "learning_rate": 2.4420024420024416e-07, + "loss": 0.6571, + "step": 2728 + }, + { + "epoch": 2.9976657970616505, + "grad_norm": 0.42292630672454834, + "learning_rate": 1.2210012210012208e-07, + "loss": 0.4772, + "step": 2729 + }, + { + "epoch": 2.9987642455032266, + "grad_norm": 0.4350331425666809, + "learning_rate": 0.0, + "loss": 0.7493, + "step": 2730 + }, + { + "epoch": 2.9987642455032266, + "step": 2730, + "total_flos": 1.0372510312766669e+18, + "train_loss": 0.674373844124022, + "train_runtime": 11584.4184, + "train_samples_per_second": 1.886, + "train_steps_per_second": 0.236 + } + ], + "logging_steps": 1.0, + "max_steps": 2730, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0372510312766669e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}