diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9573653306101608, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000797804442175134, + "grad_norm": 1.1620235443115234, + "learning_rate": 3.125e-06, + "loss": 1.0717, + "step": 25 + }, + { + "epoch": 0.001595608884350268, + "grad_norm": 0.1761351078748703, + "learning_rate": 6.25e-06, + "loss": 1.0035, + "step": 50 + }, + { + "epoch": 0.0023934133265254023, + "grad_norm": 0.7293746471405029, + "learning_rate": 9.375000000000001e-06, + "loss": 1.0267, + "step": 75 + }, + { + "epoch": 0.003191217768700536, + "grad_norm": 0.5050331950187683, + "learning_rate": 1.25e-05, + "loss": 0.8625, + "step": 100 + }, + { + "epoch": 0.00398902221087567, + "grad_norm": 0.5066962242126465, + "learning_rate": 1.5625e-05, + "loss": 0.7529, + "step": 125 + }, + { + "epoch": 0.0047868266530508045, + "grad_norm": 0.7367194294929504, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.765, + "step": 150 + }, + { + "epoch": 0.005584631095225938, + "grad_norm": 0.5524747967720032, + "learning_rate": 2.1875e-05, + "loss": 0.7062, + "step": 175 + }, + { + "epoch": 0.006382435537401072, + "grad_norm": 0.6297128200531006, + "learning_rate": 2.5e-05, + "loss": 0.7283, + "step": 200 + }, + { + "epoch": 0.007180239979576206, + "grad_norm": 0.3561699092388153, + "learning_rate": 2.4979926772867422e-05, + "loss": 0.6711, + "step": 225 + }, + { + "epoch": 0.00797804442175134, + "grad_norm": 0.6796361207962036, + "learning_rate": 2.4959853545734843e-05, + "loss": 0.6566, + "step": 250 + }, + { + "epoch": 0.008775848863926475, + "grad_norm": 0.9527813792228699, + "learning_rate": 2.493978031860226e-05, + "loss": 0.6333, + "step": 275 + }, + { + "epoch": 0.009573653306101609, + "grad_norm": 0.5738817453384399, + "learning_rate": 2.4919707091469682e-05, + "loss": 0.6126, + "step": 300 + }, + { + "epoch": 0.010371457748276743, + "grad_norm": 0.8721635937690735, + "learning_rate": 2.4899633864337103e-05, + "loss": 0.5869, + "step": 325 + }, + { + "epoch": 0.011169262190451876, + "grad_norm": 0.8487357497215271, + "learning_rate": 2.4879560637204525e-05, + "loss": 0.6693, + "step": 350 + }, + { + "epoch": 0.01196706663262701, + "grad_norm": 0.8497329950332642, + "learning_rate": 2.4859487410071942e-05, + "loss": 0.6331, + "step": 375 + }, + { + "epoch": 0.012764871074802144, + "grad_norm": 0.7911163568496704, + "learning_rate": 2.4839414182939363e-05, + "loss": 0.6252, + "step": 400 + }, + { + "epoch": 0.013562675516977278, + "grad_norm": 1.0417559146881104, + "learning_rate": 2.4819340955806785e-05, + "loss": 0.654, + "step": 425 + }, + { + "epoch": 0.014360479959152413, + "grad_norm": 0.7947675585746765, + "learning_rate": 2.4799267728674206e-05, + "loss": 0.6265, + "step": 450 + }, + { + "epoch": 0.015158284401327547, + "grad_norm": 0.654082179069519, + "learning_rate": 2.4779194501541623e-05, + "loss": 0.5948, + "step": 475 + }, + { + "epoch": 0.01595608884350268, + "grad_norm": 0.5939388275146484, + "learning_rate": 2.4759121274409045e-05, + "loss": 0.5704, + "step": 500 + }, + { + "epoch": 0.016753893285677814, + "grad_norm": 0.5853065848350525, + "learning_rate": 2.4739048047276466e-05, + "loss": 0.6123, + "step": 525 + }, + { + "epoch": 
0.01755169772785295, + "grad_norm": 0.924404501914978, + "learning_rate": 2.4718974820143887e-05, + "loss": 0.578, + "step": 550 + }, + { + "epoch": 0.018349502170028082, + "grad_norm": 0.6280034184455872, + "learning_rate": 2.4698901593011305e-05, + "loss": 0.612, + "step": 575 + }, + { + "epoch": 0.019147306612203218, + "grad_norm": 1.4167883396148682, + "learning_rate": 2.4678828365878726e-05, + "loss": 0.6074, + "step": 600 + }, + { + "epoch": 0.01994511105437835, + "grad_norm": 1.254233717918396, + "learning_rate": 2.4658755138746147e-05, + "loss": 0.6437, + "step": 625 + }, + { + "epoch": 0.020742915496553486, + "grad_norm": 0.6111056208610535, + "learning_rate": 2.4638681911613568e-05, + "loss": 0.6093, + "step": 650 + }, + { + "epoch": 0.02154071993872862, + "grad_norm": 0.7576286792755127, + "learning_rate": 2.4618608684480986e-05, + "loss": 0.5317, + "step": 675 + }, + { + "epoch": 0.02233852438090375, + "grad_norm": 1.1031715869903564, + "learning_rate": 2.459853545734841e-05, + "loss": 0.6359, + "step": 700 + }, + { + "epoch": 0.023136328823078887, + "grad_norm": 0.9123847484588623, + "learning_rate": 2.4578462230215828e-05, + "loss": 0.5288, + "step": 725 + }, + { + "epoch": 0.02393413326525402, + "grad_norm": 1.0893287658691406, + "learning_rate": 2.455838900308325e-05, + "loss": 0.5866, + "step": 750 + }, + { + "epoch": 0.024731937707429156, + "grad_norm": 0.6093726754188538, + "learning_rate": 2.4538315775950667e-05, + "loss": 0.524, + "step": 775 + }, + { + "epoch": 0.02552974214960429, + "grad_norm": 0.8649279475212097, + "learning_rate": 2.451824254881809e-05, + "loss": 0.6248, + "step": 800 + }, + { + "epoch": 0.026327546591779424, + "grad_norm": 0.7364605069160461, + "learning_rate": 2.449816932168551e-05, + "loss": 0.5878, + "step": 825 + }, + { + "epoch": 0.027125351033954557, + "grad_norm": 1.0632132291793823, + "learning_rate": 2.447809609455293e-05, + "loss": 0.6738, + "step": 850 + }, + { + "epoch": 0.027923155476129693, + "grad_norm": 0.5434523224830627, + "learning_rate": 2.4458022867420348e-05, + "loss": 0.6436, + "step": 875 + }, + { + "epoch": 0.028720959918304825, + "grad_norm": 1.3886839151382446, + "learning_rate": 2.4437949640287773e-05, + "loss": 0.6421, + "step": 900 + }, + { + "epoch": 0.029518764360479958, + "grad_norm": 0.66085284948349, + "learning_rate": 2.441787641315519e-05, + "loss": 0.5792, + "step": 925 + }, + { + "epoch": 0.030316568802655094, + "grad_norm": 0.39016640186309814, + "learning_rate": 2.439780318602261e-05, + "loss": 0.5376, + "step": 950 + }, + { + "epoch": 0.031114373244830226, + "grad_norm": 1.0862053632736206, + "learning_rate": 2.4377729958890033e-05, + "loss": 0.4904, + "step": 975 + }, + { + "epoch": 0.03191217768700536, + "grad_norm": 0.7027643918991089, + "learning_rate": 2.4357656731757454e-05, + "loss": 0.6461, + "step": 1000 + }, + { + "epoch": 0.0327099821291805, + "grad_norm": 1.035848617553711, + "learning_rate": 2.433758350462487e-05, + "loss": 0.6061, + "step": 1025 + }, + { + "epoch": 0.03350778657135563, + "grad_norm": 0.9431428909301758, + "learning_rate": 2.4317510277492293e-05, + "loss": 0.585, + "step": 1050 + }, + { + "epoch": 0.03430559101353076, + "grad_norm": 1.2081215381622314, + "learning_rate": 2.4297437050359714e-05, + "loss": 0.5293, + "step": 1075 + }, + { + "epoch": 0.0351033954557059, + "grad_norm": 0.6889535784721375, + "learning_rate": 2.4277363823227135e-05, + "loss": 0.5132, + "step": 1100 + }, + { + "epoch": 0.03590119989788103, + "grad_norm": 1.3305994272232056, + "learning_rate": 
2.4257290596094553e-05, + "loss": 0.6028, + "step": 1125 + }, + { + "epoch": 0.036699004340056164, + "grad_norm": 0.4918406307697296, + "learning_rate": 2.4237217368961977e-05, + "loss": 0.6059, + "step": 1150 + }, + { + "epoch": 0.0374968087822313, + "grad_norm": 0.9576552510261536, + "learning_rate": 2.4217144141829395e-05, + "loss": 0.581, + "step": 1175 + }, + { + "epoch": 0.038294613224406436, + "grad_norm": 0.533026397228241, + "learning_rate": 2.4197070914696816e-05, + "loss": 0.5114, + "step": 1200 + }, + { + "epoch": 0.039092417666581565, + "grad_norm": 1.2196937799453735, + "learning_rate": 2.4176997687564234e-05, + "loss": 0.6211, + "step": 1225 + }, + { + "epoch": 0.0398902221087567, + "grad_norm": 1.5606813430786133, + "learning_rate": 2.4156924460431658e-05, + "loss": 0.5383, + "step": 1250 + }, + { + "epoch": 0.04068802655093184, + "grad_norm": 0.4488339424133301, + "learning_rate": 2.4136851233299076e-05, + "loss": 0.563, + "step": 1275 + }, + { + "epoch": 0.04148583099310697, + "grad_norm": 0.9547097086906433, + "learning_rate": 2.4116778006166497e-05, + "loss": 0.5983, + "step": 1300 + }, + { + "epoch": 0.0422836354352821, + "grad_norm": 0.40564727783203125, + "learning_rate": 2.4096704779033915e-05, + "loss": 0.5555, + "step": 1325 + }, + { + "epoch": 0.04308143987745724, + "grad_norm": 0.889912486076355, + "learning_rate": 2.407663155190134e-05, + "loss": 0.5673, + "step": 1350 + }, + { + "epoch": 0.043879244319632374, + "grad_norm": 1.0422492027282715, + "learning_rate": 2.4056558324768757e-05, + "loss": 0.5523, + "step": 1375 + }, + { + "epoch": 0.0446770487618075, + "grad_norm": 1.0268012285232544, + "learning_rate": 2.4036485097636178e-05, + "loss": 0.674, + "step": 1400 + }, + { + "epoch": 0.04547485320398264, + "grad_norm": 0.4538346230983734, + "learning_rate": 2.40164118705036e-05, + "loss": 0.5532, + "step": 1425 + }, + { + "epoch": 0.046272657646157775, + "grad_norm": 0.4324471652507782, + "learning_rate": 2.399633864337102e-05, + "loss": 0.5489, + "step": 1450 + }, + { + "epoch": 0.04707046208833291, + "grad_norm": 0.9547377824783325, + "learning_rate": 2.3976265416238438e-05, + "loss": 0.5658, + "step": 1475 + }, + { + "epoch": 0.04786826653050804, + "grad_norm": 0.3644287586212158, + "learning_rate": 2.395619218910586e-05, + "loss": 0.4947, + "step": 1500 + }, + { + "epoch": 0.048666070972683176, + "grad_norm": 0.9548143744468689, + "learning_rate": 2.393611896197328e-05, + "loss": 0.5477, + "step": 1525 + }, + { + "epoch": 0.04946387541485831, + "grad_norm": 0.7494068145751953, + "learning_rate": 2.3916045734840702e-05, + "loss": 0.4489, + "step": 1550 + }, + { + "epoch": 0.05026167985703344, + "grad_norm": 1.4067238569259644, + "learning_rate": 2.389597250770812e-05, + "loss": 0.5193, + "step": 1575 + }, + { + "epoch": 0.05105948429920858, + "grad_norm": 0.6029401421546936, + "learning_rate": 2.387589928057554e-05, + "loss": 0.4929, + "step": 1600 + }, + { + "epoch": 0.05185728874138371, + "grad_norm": 0.5118242502212524, + "learning_rate": 2.3855826053442962e-05, + "loss": 0.5729, + "step": 1625 + }, + { + "epoch": 0.05265509318355885, + "grad_norm": 0.3905925452709198, + "learning_rate": 2.3835752826310383e-05, + "loss": 0.5315, + "step": 1650 + }, + { + "epoch": 0.05345289762573398, + "grad_norm": 1.9033564329147339, + "learning_rate": 2.38156795991778e-05, + "loss": 0.573, + "step": 1675 + }, + { + "epoch": 0.054250702067909114, + "grad_norm": 0.6796665787696838, + "learning_rate": 2.3795606372045222e-05, + "loss": 0.5212, + "step": 1700 + }, + { + 
"epoch": 0.05504850651008425, + "grad_norm": 0.7749186158180237, + "learning_rate": 2.3775533144912643e-05, + "loss": 0.5417, + "step": 1725 + }, + { + "epoch": 0.055846310952259386, + "grad_norm": 0.8586596250534058, + "learning_rate": 2.3755459917780064e-05, + "loss": 0.6107, + "step": 1750 + }, + { + "epoch": 0.056644115394434515, + "grad_norm": 0.5593289136886597, + "learning_rate": 2.3735386690647482e-05, + "loss": 0.555, + "step": 1775 + }, + { + "epoch": 0.05744191983660965, + "grad_norm": 0.39711669087409973, + "learning_rate": 2.3715313463514903e-05, + "loss": 0.5285, + "step": 1800 + }, + { + "epoch": 0.05823972427878479, + "grad_norm": 0.6996438503265381, + "learning_rate": 2.3695240236382324e-05, + "loss": 0.561, + "step": 1825 + }, + { + "epoch": 0.059037528720959916, + "grad_norm": 0.6649994850158691, + "learning_rate": 2.3675167009249745e-05, + "loss": 0.6026, + "step": 1850 + }, + { + "epoch": 0.05983533316313505, + "grad_norm": 0.7238967418670654, + "learning_rate": 2.3655093782117166e-05, + "loss": 0.5061, + "step": 1875 + }, + { + "epoch": 0.06063313760531019, + "grad_norm": 1.1705780029296875, + "learning_rate": 2.3635020554984584e-05, + "loss": 0.6363, + "step": 1900 + }, + { + "epoch": 0.061430942047485323, + "grad_norm": 1.1129088401794434, + "learning_rate": 2.3614947327852005e-05, + "loss": 0.4697, + "step": 1925 + }, + { + "epoch": 0.06222874648966045, + "grad_norm": 0.6562258005142212, + "learning_rate": 2.3594874100719426e-05, + "loss": 0.5425, + "step": 1950 + }, + { + "epoch": 0.06302655093183558, + "grad_norm": 1.2954466342926025, + "learning_rate": 2.3574800873586847e-05, + "loss": 0.508, + "step": 1975 + }, + { + "epoch": 0.06382435537401072, + "grad_norm": 0.428022176027298, + "learning_rate": 2.3554727646454265e-05, + "loss": 0.5646, + "step": 2000 + }, + { + "epoch": 0.06462215981618585, + "grad_norm": 1.1137559413909912, + "learning_rate": 2.3534654419321686e-05, + "loss": 0.5514, + "step": 2025 + }, + { + "epoch": 0.065419964258361, + "grad_norm": 0.9217244982719421, + "learning_rate": 2.3514581192189107e-05, + "loss": 0.4994, + "step": 2050 + }, + { + "epoch": 0.06621776870053613, + "grad_norm": 1.9023886919021606, + "learning_rate": 2.349450796505653e-05, + "loss": 0.4763, + "step": 2075 + }, + { + "epoch": 0.06701557314271125, + "grad_norm": 0.49815502762794495, + "learning_rate": 2.3474434737923946e-05, + "loss": 0.5196, + "step": 2100 + }, + { + "epoch": 0.0678133775848864, + "grad_norm": 0.4530934691429138, + "learning_rate": 2.3454361510791367e-05, + "loss": 0.6507, + "step": 2125 + }, + { + "epoch": 0.06861118202706153, + "grad_norm": 1.0055551528930664, + "learning_rate": 2.343428828365879e-05, + "loss": 0.5758, + "step": 2150 + }, + { + "epoch": 0.06940898646923666, + "grad_norm": 0.8732419013977051, + "learning_rate": 2.341421505652621e-05, + "loss": 0.5036, + "step": 2175 + }, + { + "epoch": 0.0702067909114118, + "grad_norm": 0.7871633768081665, + "learning_rate": 2.3394141829393627e-05, + "loss": 0.5806, + "step": 2200 + }, + { + "epoch": 0.07100459535358693, + "grad_norm": 0.6435555815696716, + "learning_rate": 2.337406860226105e-05, + "loss": 0.6648, + "step": 2225 + }, + { + "epoch": 0.07180239979576206, + "grad_norm": 1.1382043361663818, + "learning_rate": 2.335399537512847e-05, + "loss": 0.5407, + "step": 2250 + }, + { + "epoch": 0.0726002042379372, + "grad_norm": 0.8095251321792603, + "learning_rate": 2.333392214799589e-05, + "loss": 0.5153, + "step": 2275 + }, + { + "epoch": 0.07339800868011233, + "grad_norm": 1.1035778522491455, + 
"learning_rate": 2.331384892086331e-05, + "loss": 0.5654, + "step": 2300 + }, + { + "epoch": 0.07419581312228747, + "grad_norm": 1.2554080486297607, + "learning_rate": 2.329377569373073e-05, + "loss": 0.5228, + "step": 2325 + }, + { + "epoch": 0.0749936175644626, + "grad_norm": 1.3043935298919678, + "learning_rate": 2.327370246659815e-05, + "loss": 0.5288, + "step": 2350 + }, + { + "epoch": 0.07579142200663773, + "grad_norm": 0.9910039901733398, + "learning_rate": 2.3253629239465572e-05, + "loss": 0.5839, + "step": 2375 + }, + { + "epoch": 0.07658922644881287, + "grad_norm": 0.8184835910797119, + "learning_rate": 2.323355601233299e-05, + "loss": 0.5325, + "step": 2400 + }, + { + "epoch": 0.077387030890988, + "grad_norm": 0.8235958814620972, + "learning_rate": 2.3213482785200414e-05, + "loss": 0.5109, + "step": 2425 + }, + { + "epoch": 0.07818483533316313, + "grad_norm": 0.581178605556488, + "learning_rate": 2.3193409558067832e-05, + "loss": 0.5618, + "step": 2450 + }, + { + "epoch": 0.07898263977533827, + "grad_norm": 1.0203675031661987, + "learning_rate": 2.3173336330935253e-05, + "loss": 0.485, + "step": 2475 + }, + { + "epoch": 0.0797804442175134, + "grad_norm": 1.0625289678573608, + "learning_rate": 2.315326310380267e-05, + "loss": 0.5517, + "step": 2500 + }, + { + "epoch": 0.08057824865968853, + "grad_norm": 0.6422656178474426, + "learning_rate": 2.3133189876670095e-05, + "loss": 0.5651, + "step": 2525 + }, + { + "epoch": 0.08137605310186367, + "grad_norm": 0.6603285670280457, + "learning_rate": 2.3113116649537513e-05, + "loss": 0.52, + "step": 2550 + }, + { + "epoch": 0.0821738575440388, + "grad_norm": 0.593621015548706, + "learning_rate": 2.3093043422404934e-05, + "loss": 0.4592, + "step": 2575 + }, + { + "epoch": 0.08297166198621395, + "grad_norm": 0.5782598853111267, + "learning_rate": 2.3072970195272355e-05, + "loss": 0.5465, + "step": 2600 + }, + { + "epoch": 0.08376946642838907, + "grad_norm": 0.6208533644676208, + "learning_rate": 2.3052896968139777e-05, + "loss": 0.5752, + "step": 2625 + }, + { + "epoch": 0.0845672708705642, + "grad_norm": 0.5738264918327332, + "learning_rate": 2.3032823741007194e-05, + "loss": 0.5624, + "step": 2650 + }, + { + "epoch": 0.08536507531273935, + "grad_norm": 0.9327691197395325, + "learning_rate": 2.3012750513874615e-05, + "loss": 0.5183, + "step": 2675 + }, + { + "epoch": 0.08616287975491448, + "grad_norm": 0.3608091473579407, + "learning_rate": 2.2992677286742037e-05, + "loss": 0.4504, + "step": 2700 + }, + { + "epoch": 0.0869606841970896, + "grad_norm": 1.4098608493804932, + "learning_rate": 2.2972604059609458e-05, + "loss": 0.5134, + "step": 2725 + }, + { + "epoch": 0.08775848863926475, + "grad_norm": 0.7239978909492493, + "learning_rate": 2.2952530832476875e-05, + "loss": 0.544, + "step": 2750 + }, + { + "epoch": 0.08855629308143988, + "grad_norm": 0.9918779134750366, + "learning_rate": 2.2932457605344297e-05, + "loss": 0.5694, + "step": 2775 + }, + { + "epoch": 0.089354097523615, + "grad_norm": 0.7250977158546448, + "learning_rate": 2.2912384378211718e-05, + "loss": 0.5262, + "step": 2800 + }, + { + "epoch": 0.09015190196579015, + "grad_norm": 0.8188421726226807, + "learning_rate": 2.289231115107914e-05, + "loss": 0.6284, + "step": 2825 + }, + { + "epoch": 0.09094970640796528, + "grad_norm": 0.46473366022109985, + "learning_rate": 2.2872237923946557e-05, + "loss": 0.4849, + "step": 2850 + }, + { + "epoch": 0.0917475108501404, + "grad_norm": 0.5546078085899353, + "learning_rate": 2.285216469681398e-05, + "loss": 0.5263, + "step": 2875 + }, + 
{ + "epoch": 0.09254531529231555, + "grad_norm": 0.9259676337242126, + "learning_rate": 2.28320914696814e-05, + "loss": 0.5464, + "step": 2900 + }, + { + "epoch": 0.09334311973449068, + "grad_norm": 1.2702957391738892, + "learning_rate": 2.281201824254882e-05, + "loss": 0.5304, + "step": 2925 + }, + { + "epoch": 0.09414092417666582, + "grad_norm": 1.6516447067260742, + "learning_rate": 2.2791945015416238e-05, + "loss": 0.5297, + "step": 2950 + }, + { + "epoch": 0.09493872861884095, + "grad_norm": 0.70711350440979, + "learning_rate": 2.2771871788283662e-05, + "loss": 0.5308, + "step": 2975 + }, + { + "epoch": 0.09573653306101608, + "grad_norm": 0.931492805480957, + "learning_rate": 2.275179856115108e-05, + "loss": 0.5609, + "step": 3000 + }, + { + "epoch": 0.09653433750319122, + "grad_norm": 0.3455384373664856, + "learning_rate": 2.27317253340185e-05, + "loss": 0.5528, + "step": 3025 + }, + { + "epoch": 0.09733214194536635, + "grad_norm": 0.7669233083724976, + "learning_rate": 2.2711652106885922e-05, + "loss": 0.4318, + "step": 3050 + }, + { + "epoch": 0.09812994638754148, + "grad_norm": 0.5945572853088379, + "learning_rate": 2.2691578879753343e-05, + "loss": 0.5695, + "step": 3075 + }, + { + "epoch": 0.09892775082971662, + "grad_norm": 0.8934717774391174, + "learning_rate": 2.267150565262076e-05, + "loss": 0.5284, + "step": 3100 + }, + { + "epoch": 0.09972555527189175, + "grad_norm": 0.6024275422096252, + "learning_rate": 2.2651432425488182e-05, + "loss": 0.5028, + "step": 3125 + }, + { + "epoch": 0.10052335971406688, + "grad_norm": 0.6754337549209595, + "learning_rate": 2.2631359198355603e-05, + "loss": 0.5949, + "step": 3150 + }, + { + "epoch": 0.10132116415624202, + "grad_norm": 1.1694283485412598, + "learning_rate": 2.2611285971223024e-05, + "loss": 0.5488, + "step": 3175 + }, + { + "epoch": 0.10211896859841715, + "grad_norm": 0.8880589604377747, + "learning_rate": 2.2591212744090442e-05, + "loss": 0.5348, + "step": 3200 + }, + { + "epoch": 0.1029167730405923, + "grad_norm": 0.425998330116272, + "learning_rate": 2.2571139516957863e-05, + "loss": 0.6288, + "step": 3225 + }, + { + "epoch": 0.10371457748276743, + "grad_norm": 0.9844962358474731, + "learning_rate": 2.2551066289825284e-05, + "loss": 0.6145, + "step": 3250 + }, + { + "epoch": 0.10451238192494255, + "grad_norm": 0.7182367444038391, + "learning_rate": 2.2530993062692706e-05, + "loss": 0.6248, + "step": 3275 + }, + { + "epoch": 0.1053101863671177, + "grad_norm": 0.493770956993103, + "learning_rate": 2.2510919835560123e-05, + "loss": 0.5463, + "step": 3300 + }, + { + "epoch": 0.10610799080929283, + "grad_norm": 1.0892950296401978, + "learning_rate": 2.2490846608427545e-05, + "loss": 0.543, + "step": 3325 + }, + { + "epoch": 0.10690579525146796, + "grad_norm": 0.9539535045623779, + "learning_rate": 2.2470773381294966e-05, + "loss": 0.5055, + "step": 3350 + }, + { + "epoch": 0.1077035996936431, + "grad_norm": 0.9406744837760925, + "learning_rate": 2.2450700154162387e-05, + "loss": 0.4861, + "step": 3375 + }, + { + "epoch": 0.10850140413581823, + "grad_norm": 1.0666565895080566, + "learning_rate": 2.2430626927029805e-05, + "loss": 0.4699, + "step": 3400 + }, + { + "epoch": 0.10929920857799336, + "grad_norm": 0.6132543683052063, + "learning_rate": 2.2410553699897226e-05, + "loss": 0.5105, + "step": 3425 + }, + { + "epoch": 0.1100970130201685, + "grad_norm": 1.0401798486709595, + "learning_rate": 2.2390480472764647e-05, + "loss": 0.5095, + "step": 3450 + }, + { + "epoch": 0.11089481746234363, + "grad_norm": 0.6453993320465088, + 
"learning_rate": 2.2370407245632068e-05, + "loss": 0.5078, + "step": 3475 + }, + { + "epoch": 0.11169262190451877, + "grad_norm": 1.104200839996338, + "learning_rate": 2.2350334018499486e-05, + "loss": 0.5562, + "step": 3500 + }, + { + "epoch": 0.1124904263466939, + "grad_norm": 0.8307452201843262, + "learning_rate": 2.2330260791366907e-05, + "loss": 0.5327, + "step": 3525 + }, + { + "epoch": 0.11328823078886903, + "grad_norm": 0.5826612710952759, + "learning_rate": 2.2310187564234328e-05, + "loss": 0.4893, + "step": 3550 + }, + { + "epoch": 0.11408603523104417, + "grad_norm": 0.5139064192771912, + "learning_rate": 2.2290114337101746e-05, + "loss": 0.5041, + "step": 3575 + }, + { + "epoch": 0.1148838396732193, + "grad_norm": 0.40968069434165955, + "learning_rate": 2.227004110996917e-05, + "loss": 0.519, + "step": 3600 + }, + { + "epoch": 0.11568164411539443, + "grad_norm": 0.7572267055511475, + "learning_rate": 2.2249967882836588e-05, + "loss": 0.4424, + "step": 3625 + }, + { + "epoch": 0.11647944855756957, + "grad_norm": 1.1099541187286377, + "learning_rate": 2.222989465570401e-05, + "loss": 0.5423, + "step": 3650 + }, + { + "epoch": 0.1172772529997447, + "grad_norm": 0.8752992749214172, + "learning_rate": 2.2209821428571427e-05, + "loss": 0.5944, + "step": 3675 + }, + { + "epoch": 0.11807505744191983, + "grad_norm": 1.1976771354675293, + "learning_rate": 2.218974820143885e-05, + "loss": 0.5575, + "step": 3700 + }, + { + "epoch": 0.11887286188409497, + "grad_norm": 1.2560482025146484, + "learning_rate": 2.216967497430627e-05, + "loss": 0.502, + "step": 3725 + }, + { + "epoch": 0.1196706663262701, + "grad_norm": 0.7299981713294983, + "learning_rate": 2.214960174717369e-05, + "loss": 0.5669, + "step": 3750 + }, + { + "epoch": 0.12046847076844523, + "grad_norm": 0.9207307696342468, + "learning_rate": 2.212952852004111e-05, + "loss": 0.5062, + "step": 3775 + }, + { + "epoch": 0.12126627521062038, + "grad_norm": 1.2774001359939575, + "learning_rate": 2.2109455292908532e-05, + "loss": 0.5401, + "step": 3800 + }, + { + "epoch": 0.1220640796527955, + "grad_norm": 0.797244668006897, + "learning_rate": 2.208938206577595e-05, + "loss": 0.5483, + "step": 3825 + }, + { + "epoch": 0.12286188409497065, + "grad_norm": 0.9657193422317505, + "learning_rate": 2.206930883864337e-05, + "loss": 0.5249, + "step": 3850 + }, + { + "epoch": 0.12365968853714578, + "grad_norm": 0.7845392227172852, + "learning_rate": 2.2049235611510792e-05, + "loss": 0.5519, + "step": 3875 + }, + { + "epoch": 0.1244574929793209, + "grad_norm": 0.8090838193893433, + "learning_rate": 2.2029162384378214e-05, + "loss": 0.565, + "step": 3900 + }, + { + "epoch": 0.12525529742149605, + "grad_norm": 0.8069561719894409, + "learning_rate": 2.200908915724563e-05, + "loss": 0.6185, + "step": 3925 + }, + { + "epoch": 0.12605310186367116, + "grad_norm": 0.8270429372787476, + "learning_rate": 2.1989015930113052e-05, + "loss": 0.5291, + "step": 3950 + }, + { + "epoch": 0.1268509063058463, + "grad_norm": 0.7712761163711548, + "learning_rate": 2.1968942702980474e-05, + "loss": 0.581, + "step": 3975 + }, + { + "epoch": 0.12764871074802145, + "grad_norm": 0.6054518222808838, + "learning_rate": 2.1948869475847895e-05, + "loss": 0.4576, + "step": 4000 + }, + { + "epoch": 0.1284465151901966, + "grad_norm": 1.1705981492996216, + "learning_rate": 2.1928796248715312e-05, + "loss": 0.481, + "step": 4025 + }, + { + "epoch": 0.1292443196323717, + "grad_norm": 1.2569853067398071, + "learning_rate": 2.1908723021582737e-05, + "loss": 0.5749, + "step": 4050 + }, + { 
+ "epoch": 0.13004212407454685, + "grad_norm": 1.009342908859253, + "learning_rate": 2.1888649794450155e-05, + "loss": 0.5604, + "step": 4075 + }, + { + "epoch": 0.130839928516722, + "grad_norm": 0.8715687394142151, + "learning_rate": 2.1868576567317576e-05, + "loss": 0.4648, + "step": 4100 + }, + { + "epoch": 0.1316377329588971, + "grad_norm": 0.7666533589363098, + "learning_rate": 2.1848503340184994e-05, + "loss": 0.5616, + "step": 4125 + }, + { + "epoch": 0.13243553740107225, + "grad_norm": 0.9141665101051331, + "learning_rate": 2.1828430113052418e-05, + "loss": 0.517, + "step": 4150 + }, + { + "epoch": 0.1332333418432474, + "grad_norm": 0.7477203011512756, + "learning_rate": 2.1808356885919836e-05, + "loss": 0.4709, + "step": 4175 + }, + { + "epoch": 0.1340311462854225, + "grad_norm": 1.1471550464630127, + "learning_rate": 2.1788283658787257e-05, + "loss": 0.4974, + "step": 4200 + }, + { + "epoch": 0.13482895072759765, + "grad_norm": 0.5586664080619812, + "learning_rate": 2.1768210431654678e-05, + "loss": 0.5779, + "step": 4225 + }, + { + "epoch": 0.1356267551697728, + "grad_norm": 0.8173395395278931, + "learning_rate": 2.17481372045221e-05, + "loss": 0.4929, + "step": 4250 + }, + { + "epoch": 0.1364245596119479, + "grad_norm": 0.8017388582229614, + "learning_rate": 2.1728063977389517e-05, + "loss": 0.423, + "step": 4275 + }, + { + "epoch": 0.13722236405412305, + "grad_norm": 0.6071333885192871, + "learning_rate": 2.1707990750256938e-05, + "loss": 0.5407, + "step": 4300 + }, + { + "epoch": 0.1380201684962982, + "grad_norm": 0.6837856769561768, + "learning_rate": 2.168791752312436e-05, + "loss": 0.4963, + "step": 4325 + }, + { + "epoch": 0.1388179729384733, + "grad_norm": 0.5120430588722229, + "learning_rate": 2.166784429599178e-05, + "loss": 0.5045, + "step": 4350 + }, + { + "epoch": 0.13961577738064845, + "grad_norm": 0.8727617263793945, + "learning_rate": 2.1647771068859198e-05, + "loss": 0.4111, + "step": 4375 + }, + { + "epoch": 0.1404135818228236, + "grad_norm": 1.0532296895980835, + "learning_rate": 2.162769784172662e-05, + "loss": 0.4489, + "step": 4400 + }, + { + "epoch": 0.1412113862649987, + "grad_norm": 0.57016921043396, + "learning_rate": 2.160762461459404e-05, + "loss": 0.477, + "step": 4425 + }, + { + "epoch": 0.14200919070717385, + "grad_norm": 0.8259162902832031, + "learning_rate": 2.158755138746146e-05, + "loss": 0.5164, + "step": 4450 + }, + { + "epoch": 0.142806995149349, + "grad_norm": 0.3785061538219452, + "learning_rate": 2.156747816032888e-05, + "loss": 0.4683, + "step": 4475 + }, + { + "epoch": 0.1436047995915241, + "grad_norm": 1.1364110708236694, + "learning_rate": 2.1547404933196304e-05, + "loss": 0.4723, + "step": 4500 + }, + { + "epoch": 0.14440260403369926, + "grad_norm": 0.8808683156967163, + "learning_rate": 2.152733170606372e-05, + "loss": 0.5151, + "step": 4525 + }, + { + "epoch": 0.1452004084758744, + "grad_norm": 0.8673577308654785, + "learning_rate": 2.1507258478931143e-05, + "loss": 0.5108, + "step": 4550 + }, + { + "epoch": 0.14599821291804954, + "grad_norm": 1.2383835315704346, + "learning_rate": 2.148718525179856e-05, + "loss": 0.5185, + "step": 4575 + }, + { + "epoch": 0.14679601736022466, + "grad_norm": 1.3869210481643677, + "learning_rate": 2.1467112024665985e-05, + "loss": 0.5453, + "step": 4600 + }, + { + "epoch": 0.1475938218023998, + "grad_norm": 0.4665587842464447, + "learning_rate": 2.1447038797533403e-05, + "loss": 0.4585, + "step": 4625 + }, + { + "epoch": 0.14839162624457494, + "grad_norm": 0.29424333572387695, + "learning_rate": 
2.1426965570400824e-05, + "loss": 0.5438, + "step": 4650 + }, + { + "epoch": 0.14918943068675006, + "grad_norm": 1.7765225172042847, + "learning_rate": 2.140689234326824e-05, + "loss": 0.5499, + "step": 4675 + }, + { + "epoch": 0.1499872351289252, + "grad_norm": 0.9864098429679871, + "learning_rate": 2.1386819116135666e-05, + "loss": 0.4991, + "step": 4700 + }, + { + "epoch": 0.15078503957110034, + "grad_norm": 0.8926157355308533, + "learning_rate": 2.1366745889003084e-05, + "loss": 0.5008, + "step": 4725 + }, + { + "epoch": 0.15158284401327546, + "grad_norm": 0.7439494729042053, + "learning_rate": 2.1346672661870505e-05, + "loss": 0.5324, + "step": 4750 + }, + { + "epoch": 0.1523806484554506, + "grad_norm": 0.6221075057983398, + "learning_rate": 2.1326599434737926e-05, + "loss": 0.5064, + "step": 4775 + }, + { + "epoch": 0.15317845289762574, + "grad_norm": 0.8994887471199036, + "learning_rate": 2.1306526207605347e-05, + "loss": 0.5377, + "step": 4800 + }, + { + "epoch": 0.15397625733980086, + "grad_norm": 1.5914185047149658, + "learning_rate": 2.1286452980472765e-05, + "loss": 0.4884, + "step": 4825 + }, + { + "epoch": 0.154774061781976, + "grad_norm": 0.37632209062576294, + "learning_rate": 2.1266379753340186e-05, + "loss": 0.5193, + "step": 4850 + }, + { + "epoch": 0.15557186622415115, + "grad_norm": 1.095083475112915, + "learning_rate": 2.1246306526207607e-05, + "loss": 0.4342, + "step": 4875 + }, + { + "epoch": 0.15636967066632626, + "grad_norm": 1.2659378051757812, + "learning_rate": 2.122623329907503e-05, + "loss": 0.5291, + "step": 4900 + }, + { + "epoch": 0.1571674751085014, + "grad_norm": 0.8220809698104858, + "learning_rate": 2.1206160071942446e-05, + "loss": 0.5319, + "step": 4925 + }, + { + "epoch": 0.15796527955067655, + "grad_norm": 0.8385465145111084, + "learning_rate": 2.1186086844809867e-05, + "loss": 0.4569, + "step": 4950 + }, + { + "epoch": 0.15876308399285166, + "grad_norm": 0.5405530333518982, + "learning_rate": 2.116601361767729e-05, + "loss": 0.4429, + "step": 4975 + }, + { + "epoch": 0.1595608884350268, + "grad_norm": 1.7731573581695557, + "learning_rate": 2.114594039054471e-05, + "loss": 0.4868, + "step": 5000 + }, + { + "epoch": 0.16035869287720195, + "grad_norm": 1.1661796569824219, + "learning_rate": 2.1125867163412127e-05, + "loss": 0.438, + "step": 5025 + }, + { + "epoch": 0.16115649731937706, + "grad_norm": 0.8461349606513977, + "learning_rate": 2.110579393627955e-05, + "loss": 0.566, + "step": 5050 + }, + { + "epoch": 0.1619543017615522, + "grad_norm": 0.503616452217102, + "learning_rate": 2.108572070914697e-05, + "loss": 0.4791, + "step": 5075 + }, + { + "epoch": 0.16275210620372735, + "grad_norm": 1.0445747375488281, + "learning_rate": 2.1065647482014387e-05, + "loss": 0.5193, + "step": 5100 + }, + { + "epoch": 0.16354991064590246, + "grad_norm": 0.9016474485397339, + "learning_rate": 2.104557425488181e-05, + "loss": 0.5409, + "step": 5125 + }, + { + "epoch": 0.1643477150880776, + "grad_norm": 1.2945493459701538, + "learning_rate": 2.102550102774923e-05, + "loss": 0.528, + "step": 5150 + }, + { + "epoch": 0.16514551953025275, + "grad_norm": 1.1654711961746216, + "learning_rate": 2.100542780061665e-05, + "loss": 0.5258, + "step": 5175 + }, + { + "epoch": 0.1659433239724279, + "grad_norm": 0.4375602602958679, + "learning_rate": 2.098535457348407e-05, + "loss": 0.5405, + "step": 5200 + }, + { + "epoch": 0.166741128414603, + "grad_norm": 0.808623194694519, + "learning_rate": 2.0965281346351493e-05, + "loss": 0.4869, + "step": 5225 + }, + { + "epoch": 
0.16753893285677815, + "grad_norm": 0.881857693195343, + "learning_rate": 2.094520811921891e-05, + "loss": 0.492, + "step": 5250 + }, + { + "epoch": 0.1683367372989533, + "grad_norm": 0.7097049355506897, + "learning_rate": 2.0925134892086332e-05, + "loss": 0.4401, + "step": 5275 + }, + { + "epoch": 0.1691345417411284, + "grad_norm": 0.6862661242485046, + "learning_rate": 2.090506166495375e-05, + "loss": 0.5644, + "step": 5300 + }, + { + "epoch": 0.16993234618330355, + "grad_norm": 0.5271662473678589, + "learning_rate": 2.0884988437821174e-05, + "loss": 0.536, + "step": 5325 + }, + { + "epoch": 0.1707301506254787, + "grad_norm": 1.1128803491592407, + "learning_rate": 2.0864915210688592e-05, + "loss": 0.5166, + "step": 5350 + }, + { + "epoch": 0.1715279550676538, + "grad_norm": 0.656862199306488, + "learning_rate": 2.0844841983556013e-05, + "loss": 0.4488, + "step": 5375 + }, + { + "epoch": 0.17232575950982895, + "grad_norm": 0.5935942530632019, + "learning_rate": 2.0824768756423434e-05, + "loss": 0.5376, + "step": 5400 + }, + { + "epoch": 0.1731235639520041, + "grad_norm": 0.6691839098930359, + "learning_rate": 2.0804695529290855e-05, + "loss": 0.512, + "step": 5425 + }, + { + "epoch": 0.1739213683941792, + "grad_norm": 0.6211944818496704, + "learning_rate": 2.0784622302158273e-05, + "loss": 0.535, + "step": 5450 + }, + { + "epoch": 0.17471917283635435, + "grad_norm": 0.8693767786026001, + "learning_rate": 2.0764549075025694e-05, + "loss": 0.5033, + "step": 5475 + }, + { + "epoch": 0.1755169772785295, + "grad_norm": 0.5080140829086304, + "learning_rate": 2.0744475847893115e-05, + "loss": 0.5029, + "step": 5500 + }, + { + "epoch": 0.1763147817207046, + "grad_norm": 0.9701049327850342, + "learning_rate": 2.0724402620760536e-05, + "loss": 0.4856, + "step": 5525 + }, + { + "epoch": 0.17711258616287975, + "grad_norm": 0.9497528672218323, + "learning_rate": 2.0704329393627954e-05, + "loss": 0.5074, + "step": 5550 + }, + { + "epoch": 0.1779103906050549, + "grad_norm": 1.4564626216888428, + "learning_rate": 2.0684256166495375e-05, + "loss": 0.4149, + "step": 5575 + }, + { + "epoch": 0.17870819504723, + "grad_norm": 1.11367928981781, + "learning_rate": 2.0664182939362796e-05, + "loss": 0.4164, + "step": 5600 + }, + { + "epoch": 0.17950599948940515, + "grad_norm": 0.6194526553153992, + "learning_rate": 2.0644109712230218e-05, + "loss": 0.5459, + "step": 5625 + }, + { + "epoch": 0.1803038039315803, + "grad_norm": 1.52586030960083, + "learning_rate": 2.0624036485097635e-05, + "loss": 0.5643, + "step": 5650 + }, + { + "epoch": 0.1811016083737554, + "grad_norm": 0.6483252644538879, + "learning_rate": 2.060396325796506e-05, + "loss": 0.4655, + "step": 5675 + }, + { + "epoch": 0.18189941281593056, + "grad_norm": 0.6978493928909302, + "learning_rate": 2.0583890030832478e-05, + "loss": 0.5387, + "step": 5700 + }, + { + "epoch": 0.1826972172581057, + "grad_norm": 0.3132692277431488, + "learning_rate": 2.05638168036999e-05, + "loss": 0.4972, + "step": 5725 + }, + { + "epoch": 0.1834950217002808, + "grad_norm": 0.7360209226608276, + "learning_rate": 2.0543743576567316e-05, + "loss": 0.6053, + "step": 5750 + }, + { + "epoch": 0.18429282614245596, + "grad_norm": 0.4095138907432556, + "learning_rate": 2.052367034943474e-05, + "loss": 0.4819, + "step": 5775 + }, + { + "epoch": 0.1850906305846311, + "grad_norm": 0.6342286467552185, + "learning_rate": 2.050359712230216e-05, + "loss": 0.4445, + "step": 5800 + }, + { + "epoch": 0.18588843502680624, + "grad_norm": 1.3439861536026, + "learning_rate": 
2.048352389516958e-05, + "loss": 0.407, + "step": 5825 + }, + { + "epoch": 0.18668623946898136, + "grad_norm": 1.3485324382781982, + "learning_rate": 2.0463450668036998e-05, + "loss": 0.4515, + "step": 5850 + }, + { + "epoch": 0.1874840439111565, + "grad_norm": 1.4493141174316406, + "learning_rate": 2.0443377440904422e-05, + "loss": 0.4998, + "step": 5875 + }, + { + "epoch": 0.18828184835333164, + "grad_norm": 0.6169376969337463, + "learning_rate": 2.042330421377184e-05, + "loss": 0.4751, + "step": 5900 + }, + { + "epoch": 0.18907965279550676, + "grad_norm": 0.8687784671783447, + "learning_rate": 2.040323098663926e-05, + "loss": 0.5946, + "step": 5925 + }, + { + "epoch": 0.1898774572376819, + "grad_norm": 1.027100920677185, + "learning_rate": 2.0383157759506682e-05, + "loss": 0.4756, + "step": 5950 + }, + { + "epoch": 0.19067526167985704, + "grad_norm": 2.688061475753784, + "learning_rate": 2.0363084532374103e-05, + "loss": 0.505, + "step": 5975 + }, + { + "epoch": 0.19147306612203216, + "grad_norm": 0.5821250081062317, + "learning_rate": 2.034301130524152e-05, + "loss": 0.4667, + "step": 6000 + }, + { + "epoch": 0.1922708705642073, + "grad_norm": 1.2581595182418823, + "learning_rate": 2.0322938078108942e-05, + "loss": 0.5072, + "step": 6025 + }, + { + "epoch": 0.19306867500638245, + "grad_norm": 0.5629422068595886, + "learning_rate": 2.0302864850976363e-05, + "loss": 0.4727, + "step": 6050 + }, + { + "epoch": 0.19386647944855756, + "grad_norm": 1.0601611137390137, + "learning_rate": 2.0282791623843784e-05, + "loss": 0.516, + "step": 6075 + }, + { + "epoch": 0.1946642838907327, + "grad_norm": 0.5984941720962524, + "learning_rate": 2.0262718396711202e-05, + "loss": 0.4991, + "step": 6100 + }, + { + "epoch": 0.19546208833290785, + "grad_norm": 1.2396142482757568, + "learning_rate": 2.0242645169578627e-05, + "loss": 0.4942, + "step": 6125 + }, + { + "epoch": 0.19625989277508296, + "grad_norm": 0.722944974899292, + "learning_rate": 2.0222571942446044e-05, + "loss": 0.5489, + "step": 6150 + }, + { + "epoch": 0.1970576972172581, + "grad_norm": 0.28916776180267334, + "learning_rate": 2.0202498715313466e-05, + "loss": 0.4739, + "step": 6175 + }, + { + "epoch": 0.19785550165943325, + "grad_norm": 0.5348458886146545, + "learning_rate": 2.0182425488180883e-05, + "loss": 0.4926, + "step": 6200 + }, + { + "epoch": 0.19865330610160836, + "grad_norm": 1.1795445680618286, + "learning_rate": 2.0162352261048308e-05, + "loss": 0.5097, + "step": 6225 + }, + { + "epoch": 0.1994511105437835, + "grad_norm": 0.9219532608985901, + "learning_rate": 2.0142279033915726e-05, + "loss": 0.441, + "step": 6250 + }, + { + "epoch": 0.20024891498595865, + "grad_norm": 0.9598010778427124, + "learning_rate": 2.0122205806783147e-05, + "loss": 0.4856, + "step": 6275 + }, + { + "epoch": 0.20104671942813376, + "grad_norm": 1.0245938301086426, + "learning_rate": 2.0102132579650564e-05, + "loss": 0.449, + "step": 6300 + }, + { + "epoch": 0.2018445238703089, + "grad_norm": 1.4804080724716187, + "learning_rate": 2.008205935251799e-05, + "loss": 0.5155, + "step": 6325 + }, + { + "epoch": 0.20264232831248405, + "grad_norm": 0.5288428664207458, + "learning_rate": 2.0061986125385407e-05, + "loss": 0.4514, + "step": 6350 + }, + { + "epoch": 0.20344013275465916, + "grad_norm": 0.9050682783126831, + "learning_rate": 2.0041912898252828e-05, + "loss": 0.4946, + "step": 6375 + }, + { + "epoch": 0.2042379371968343, + "grad_norm": 1.1348626613616943, + "learning_rate": 2.002183967112025e-05, + "loss": 0.492, + "step": 6400 + }, + { + "epoch": 
0.20503574163900945, + "grad_norm": 0.9863408803939819, + "learning_rate": 2.000176644398767e-05, + "loss": 0.4849, + "step": 6425 + }, + { + "epoch": 0.2058335460811846, + "grad_norm": 0.8158233165740967, + "learning_rate": 1.9981693216855088e-05, + "loss": 0.4371, + "step": 6450 + }, + { + "epoch": 0.2066313505233597, + "grad_norm": 0.5699653625488281, + "learning_rate": 1.996161998972251e-05, + "loss": 0.4869, + "step": 6475 + }, + { + "epoch": 0.20742915496553485, + "grad_norm": 0.6135886311531067, + "learning_rate": 1.994154676258993e-05, + "loss": 0.4866, + "step": 6500 + }, + { + "epoch": 0.20822695940771, + "grad_norm": 0.8482078909873962, + "learning_rate": 1.9921473535457348e-05, + "loss": 0.5002, + "step": 6525 + }, + { + "epoch": 0.2090247638498851, + "grad_norm": 0.9109760522842407, + "learning_rate": 1.990140030832477e-05, + "loss": 0.4985, + "step": 6550 + }, + { + "epoch": 0.20982256829206025, + "grad_norm": 0.7344380617141724, + "learning_rate": 1.988132708119219e-05, + "loss": 0.494, + "step": 6575 + }, + { + "epoch": 0.2106203727342354, + "grad_norm": 0.7968775629997253, + "learning_rate": 1.986125385405961e-05, + "loss": 0.5183, + "step": 6600 + }, + { + "epoch": 0.2114181771764105, + "grad_norm": 1.089021921157837, + "learning_rate": 1.984118062692703e-05, + "loss": 0.5522, + "step": 6625 + }, + { + "epoch": 0.21221598161858565, + "grad_norm": 0.886903703212738, + "learning_rate": 1.982110739979445e-05, + "loss": 0.516, + "step": 6650 + }, + { + "epoch": 0.2130137860607608, + "grad_norm": 0.8863084316253662, + "learning_rate": 1.980103417266187e-05, + "loss": 0.4592, + "step": 6675 + }, + { + "epoch": 0.2138115905029359, + "grad_norm": 0.4921455383300781, + "learning_rate": 1.9780960945529292e-05, + "loss": 0.483, + "step": 6700 + }, + { + "epoch": 0.21460939494511105, + "grad_norm": 0.9074678421020508, + "learning_rate": 1.976088771839671e-05, + "loss": 0.5242, + "step": 6725 + }, + { + "epoch": 0.2154071993872862, + "grad_norm": 0.6239631772041321, + "learning_rate": 1.974081449126413e-05, + "loss": 0.4701, + "step": 6750 + }, + { + "epoch": 0.2162050038294613, + "grad_norm": 0.6419113278388977, + "learning_rate": 1.9720741264131552e-05, + "loss": 0.5541, + "step": 6775 + }, + { + "epoch": 0.21700280827163645, + "grad_norm": 1.3911529779434204, + "learning_rate": 1.9700668036998974e-05, + "loss": 0.4585, + "step": 6800 + }, + { + "epoch": 0.2178006127138116, + "grad_norm": 1.0443439483642578, + "learning_rate": 1.968059480986639e-05, + "loss": 0.4677, + "step": 6825 + }, + { + "epoch": 0.2185984171559867, + "grad_norm": 1.1934226751327515, + "learning_rate": 1.9660521582733816e-05, + "loss": 0.4793, + "step": 6850 + }, + { + "epoch": 0.21939622159816186, + "grad_norm": 0.3158310055732727, + "learning_rate": 1.9640448355601234e-05, + "loss": 0.5371, + "step": 6875 + }, + { + "epoch": 0.220194026040337, + "grad_norm": 0.8096803426742554, + "learning_rate": 1.9620375128468655e-05, + "loss": 0.6083, + "step": 6900 + }, + { + "epoch": 0.2209918304825121, + "grad_norm": 0.4177459478378296, + "learning_rate": 1.9600301901336072e-05, + "loss": 0.5113, + "step": 6925 + }, + { + "epoch": 0.22178963492468726, + "grad_norm": 0.9590117931365967, + "learning_rate": 1.9580228674203497e-05, + "loss": 0.4658, + "step": 6950 + }, + { + "epoch": 0.2225874393668624, + "grad_norm": 1.7520601749420166, + "learning_rate": 1.9560155447070915e-05, + "loss": 0.5562, + "step": 6975 + }, + { + "epoch": 0.22338524380903754, + "grad_norm": 0.8236017823219299, + "learning_rate": 
1.9540082219938336e-05, + "loss": 0.4285, + "step": 7000 + }, + { + "epoch": 0.22418304825121266, + "grad_norm": 0.48859933018684387, + "learning_rate": 1.9520008992805754e-05, + "loss": 0.4806, + "step": 7025 + }, + { + "epoch": 0.2249808526933878, + "grad_norm": 1.1268525123596191, + "learning_rate": 1.9499935765673178e-05, + "loss": 0.5602, + "step": 7050 + }, + { + "epoch": 0.22577865713556294, + "grad_norm": 1.7955353260040283, + "learning_rate": 1.9479862538540596e-05, + "loss": 0.4894, + "step": 7075 + }, + { + "epoch": 0.22657646157773806, + "grad_norm": 0.9489879012107849, + "learning_rate": 1.9459789311408017e-05, + "loss": 0.5958, + "step": 7100 + }, + { + "epoch": 0.2273742660199132, + "grad_norm": 0.6150937080383301, + "learning_rate": 1.9439716084275438e-05, + "loss": 0.4284, + "step": 7125 + }, + { + "epoch": 0.22817207046208834, + "grad_norm": 0.865825891494751, + "learning_rate": 1.941964285714286e-05, + "loss": 0.5286, + "step": 7150 + }, + { + "epoch": 0.22896987490426346, + "grad_norm": 0.8219313621520996, + "learning_rate": 1.9399569630010277e-05, + "loss": 0.4516, + "step": 7175 + }, + { + "epoch": 0.2297676793464386, + "grad_norm": 0.9921424984931946, + "learning_rate": 1.9379496402877698e-05, + "loss": 0.4889, + "step": 7200 + }, + { + "epoch": 0.23056548378861375, + "grad_norm": 0.645586371421814, + "learning_rate": 1.935942317574512e-05, + "loss": 0.5168, + "step": 7225 + }, + { + "epoch": 0.23136328823078886, + "grad_norm": 0.7225667238235474, + "learning_rate": 1.933934994861254e-05, + "loss": 0.5267, + "step": 7250 + }, + { + "epoch": 0.232161092672964, + "grad_norm": 0.6475264430046082, + "learning_rate": 1.9319276721479958e-05, + "loss": 0.4812, + "step": 7275 + }, + { + "epoch": 0.23295889711513915, + "grad_norm": 0.6617599725723267, + "learning_rate": 1.9299203494347383e-05, + "loss": 0.4795, + "step": 7300 + }, + { + "epoch": 0.23375670155731426, + "grad_norm": 0.8409857153892517, + "learning_rate": 1.92791302672148e-05, + "loss": 0.501, + "step": 7325 + }, + { + "epoch": 0.2345545059994894, + "grad_norm": 0.6069079637527466, + "learning_rate": 1.925905704008222e-05, + "loss": 0.4619, + "step": 7350 + }, + { + "epoch": 0.23535231044166455, + "grad_norm": 0.4156222343444824, + "learning_rate": 1.923898381294964e-05, + "loss": 0.5283, + "step": 7375 + }, + { + "epoch": 0.23615011488383966, + "grad_norm": 0.8497269749641418, + "learning_rate": 1.9218910585817064e-05, + "loss": 0.5375, + "step": 7400 + }, + { + "epoch": 0.2369479193260148, + "grad_norm": 0.9058747887611389, + "learning_rate": 1.919883735868448e-05, + "loss": 0.522, + "step": 7425 + }, + { + "epoch": 0.23774572376818995, + "grad_norm": 0.41128915548324585, + "learning_rate": 1.9178764131551903e-05, + "loss": 0.517, + "step": 7450 + }, + { + "epoch": 0.23854352821036506, + "grad_norm": 0.824787974357605, + "learning_rate": 1.915869090441932e-05, + "loss": 0.4799, + "step": 7475 + }, + { + "epoch": 0.2393413326525402, + "grad_norm": 1.1245702505111694, + "learning_rate": 1.9138617677286745e-05, + "loss": 0.4543, + "step": 7500 + }, + { + "epoch": 0.24013913709471535, + "grad_norm": 1.861336350440979, + "learning_rate": 1.9118544450154163e-05, + "loss": 0.4628, + "step": 7525 + }, + { + "epoch": 0.24093694153689046, + "grad_norm": 1.8422930240631104, + "learning_rate": 1.9098471223021584e-05, + "loss": 0.4926, + "step": 7550 + }, + { + "epoch": 0.2417347459790656, + "grad_norm": 0.821029007434845, + "learning_rate": 1.9078397995889005e-05, + "loss": 0.5141, + "step": 7575 + }, + { + "epoch": 
0.24253255042124075, + "grad_norm": 0.7867022752761841, + "learning_rate": 1.9058324768756426e-05, + "loss": 0.4917, + "step": 7600 + }, + { + "epoch": 0.2433303548634159, + "grad_norm": 0.5639078617095947, + "learning_rate": 1.9038251541623844e-05, + "loss": 0.4945, + "step": 7625 + }, + { + "epoch": 0.244128159305591, + "grad_norm": 1.0543819665908813, + "learning_rate": 1.9018178314491265e-05, + "loss": 0.5713, + "step": 7650 + }, + { + "epoch": 0.24492596374776615, + "grad_norm": 0.8011125326156616, + "learning_rate": 1.8998105087358686e-05, + "loss": 0.4423, + "step": 7675 + }, + { + "epoch": 0.2457237681899413, + "grad_norm": 0.4197518527507782, + "learning_rate": 1.8978031860226107e-05, + "loss": 0.4802, + "step": 7700 + }, + { + "epoch": 0.2465215726321164, + "grad_norm": 0.43537619709968567, + "learning_rate": 1.8957958633093525e-05, + "loss": 0.491, + "step": 7725 + }, + { + "epoch": 0.24731937707429155, + "grad_norm": 0.9926440119743347, + "learning_rate": 1.8937885405960946e-05, + "loss": 0.5206, + "step": 7750 + }, + { + "epoch": 0.2481171815164667, + "grad_norm": 0.6548684239387512, + "learning_rate": 1.8917812178828367e-05, + "loss": 0.5278, + "step": 7775 + }, + { + "epoch": 0.2489149859586418, + "grad_norm": 0.6337385773658752, + "learning_rate": 1.889773895169579e-05, + "loss": 0.5716, + "step": 7800 + }, + { + "epoch": 0.24971279040081695, + "grad_norm": 0.8676078915596008, + "learning_rate": 1.8877665724563206e-05, + "loss": 0.4849, + "step": 7825 + }, + { + "epoch": 0.2505105948429921, + "grad_norm": 0.860718846321106, + "learning_rate": 1.885759249743063e-05, + "loss": 0.5069, + "step": 7850 + }, + { + "epoch": 0.25130839928516724, + "grad_norm": 0.6479659080505371, + "learning_rate": 1.883751927029805e-05, + "loss": 0.4364, + "step": 7875 + }, + { + "epoch": 0.2521062037273423, + "grad_norm": 0.5919177532196045, + "learning_rate": 1.881744604316547e-05, + "loss": 0.5035, + "step": 7900 + }, + { + "epoch": 0.25290400816951747, + "grad_norm": 0.6908952593803406, + "learning_rate": 1.8797372816032887e-05, + "loss": 0.4602, + "step": 7925 + }, + { + "epoch": 0.2537018126116926, + "grad_norm": 0.9885756373405457, + "learning_rate": 1.8777299588900312e-05, + "loss": 0.52, + "step": 7950 + }, + { + "epoch": 0.25449961705386775, + "grad_norm": 1.0296975374221802, + "learning_rate": 1.875722636176773e-05, + "loss": 0.542, + "step": 7975 + }, + { + "epoch": 0.2552974214960429, + "grad_norm": 0.44542399048805237, + "learning_rate": 1.873715313463515e-05, + "loss": 0.4894, + "step": 8000 + }, + { + "epoch": 0.25609522593821804, + "grad_norm": 1.1537628173828125, + "learning_rate": 1.8717079907502572e-05, + "loss": 0.5045, + "step": 8025 + }, + { + "epoch": 0.2568930303803932, + "grad_norm": 0.4203762114048004, + "learning_rate": 1.869700668036999e-05, + "loss": 0.4983, + "step": 8050 + }, + { + "epoch": 0.25769083482256827, + "grad_norm": 0.9488811492919922, + "learning_rate": 1.867693345323741e-05, + "loss": 0.4102, + "step": 8075 + }, + { + "epoch": 0.2584886392647434, + "grad_norm": 1.0219916105270386, + "learning_rate": 1.8656860226104832e-05, + "loss": 0.5188, + "step": 8100 + }, + { + "epoch": 0.25928644370691856, + "grad_norm": 2.179837465286255, + "learning_rate": 1.8636786998972253e-05, + "loss": 0.4998, + "step": 8125 + }, + { + "epoch": 0.2600842481490937, + "grad_norm": 0.5699672102928162, + "learning_rate": 1.861671377183967e-05, + "loss": 0.5445, + "step": 8150 + }, + { + "epoch": 0.26088205259126884, + "grad_norm": 1.0542255640029907, + "learning_rate": 
1.8596640544707092e-05, + "loss": 0.4803, + "step": 8175 + }, + { + "epoch": 0.261679857033444, + "grad_norm": 0.6809638142585754, + "learning_rate": 1.8576567317574513e-05, + "loss": 0.4494, + "step": 8200 + }, + { + "epoch": 0.2624776614756191, + "grad_norm": 0.8445626497268677, + "learning_rate": 1.8556494090441934e-05, + "loss": 0.5055, + "step": 8225 + }, + { + "epoch": 0.2632754659177942, + "grad_norm": 0.6919910311698914, + "learning_rate": 1.8536420863309352e-05, + "loss": 0.448, + "step": 8250 + }, + { + "epoch": 0.26407327035996936, + "grad_norm": 1.0985883474349976, + "learning_rate": 1.8516347636176773e-05, + "loss": 0.4849, + "step": 8275 + }, + { + "epoch": 0.2648710748021445, + "grad_norm": 0.7515255212783813, + "learning_rate": 1.8496274409044194e-05, + "loss": 0.4677, + "step": 8300 + }, + { + "epoch": 0.26566887924431964, + "grad_norm": 0.825547993183136, + "learning_rate": 1.8476201181911615e-05, + "loss": 0.5624, + "step": 8325 + }, + { + "epoch": 0.2664666836864948, + "grad_norm": 1.1458021402359009, + "learning_rate": 1.8456127954779033e-05, + "loss": 0.4882, + "step": 8350 + }, + { + "epoch": 0.2672644881286699, + "grad_norm": 0.3550206422805786, + "learning_rate": 1.8436054727646454e-05, + "loss": 0.4423, + "step": 8375 + }, + { + "epoch": 0.268062292570845, + "grad_norm": 0.5570346713066101, + "learning_rate": 1.8415981500513875e-05, + "loss": 0.565, + "step": 8400 + }, + { + "epoch": 0.26886009701302016, + "grad_norm": 0.5592429637908936, + "learning_rate": 1.8395908273381296e-05, + "loss": 0.4471, + "step": 8425 + }, + { + "epoch": 0.2696579014551953, + "grad_norm": 0.3842066526412964, + "learning_rate": 1.8375835046248714e-05, + "loss": 0.5283, + "step": 8450 + }, + { + "epoch": 0.27045570589737045, + "grad_norm": 0.478157103061676, + "learning_rate": 1.835576181911614e-05, + "loss": 0.4916, + "step": 8475 + }, + { + "epoch": 0.2712535103395456, + "grad_norm": 1.1027177572250366, + "learning_rate": 1.8335688591983556e-05, + "loss": 0.5794, + "step": 8500 + }, + { + "epoch": 0.27205131478172073, + "grad_norm": 1.0234731435775757, + "learning_rate": 1.8315615364850977e-05, + "loss": 0.4383, + "step": 8525 + }, + { + "epoch": 0.2728491192238958, + "grad_norm": 0.9634427428245544, + "learning_rate": 1.8295542137718395e-05, + "loss": 0.4818, + "step": 8550 + }, + { + "epoch": 0.27364692366607096, + "grad_norm": 0.6669744253158569, + "learning_rate": 1.827546891058582e-05, + "loss": 0.5153, + "step": 8575 + }, + { + "epoch": 0.2744447281082461, + "grad_norm": 0.6819443106651306, + "learning_rate": 1.8255395683453237e-05, + "loss": 0.4161, + "step": 8600 + }, + { + "epoch": 0.27524253255042125, + "grad_norm": 0.6380025148391724, + "learning_rate": 1.823532245632066e-05, + "loss": 0.4378, + "step": 8625 + }, + { + "epoch": 0.2760403369925964, + "grad_norm": 1.0884222984313965, + "learning_rate": 1.8215249229188076e-05, + "loss": 0.4557, + "step": 8650 + }, + { + "epoch": 0.27683814143477153, + "grad_norm": 0.3991362750530243, + "learning_rate": 1.81951760020555e-05, + "loss": 0.4468, + "step": 8675 + }, + { + "epoch": 0.2776359458769466, + "grad_norm": 0.6306272149085999, + "learning_rate": 1.817510277492292e-05, + "loss": 0.4402, + "step": 8700 + }, + { + "epoch": 0.27843375031912176, + "grad_norm": 0.9752461910247803, + "learning_rate": 1.815502954779034e-05, + "loss": 0.502, + "step": 8725 + }, + { + "epoch": 0.2792315547612969, + "grad_norm": 1.1096750497817993, + "learning_rate": 1.813495632065776e-05, + "loss": 0.4984, + "step": 8750 + }, + { + "epoch": 
0.28002935920347205, + "grad_norm": 0.7078149914741516, + "learning_rate": 1.8114883093525182e-05, + "loss": 0.4901, + "step": 8775 + }, + { + "epoch": 0.2808271636456472, + "grad_norm": 0.8923904299736023, + "learning_rate": 1.80948098663926e-05, + "loss": 0.4476, + "step": 8800 + }, + { + "epoch": 0.28162496808782234, + "grad_norm": 1.1634156703948975, + "learning_rate": 1.807473663926002e-05, + "loss": 0.5004, + "step": 8825 + }, + { + "epoch": 0.2824227725299974, + "grad_norm": 0.6043126583099365, + "learning_rate": 1.8054663412127442e-05, + "loss": 0.4892, + "step": 8850 + }, + { + "epoch": 0.28322057697217257, + "grad_norm": 0.7623210549354553, + "learning_rate": 1.8034590184994863e-05, + "loss": 0.4507, + "step": 8875 + }, + { + "epoch": 0.2840183814143477, + "grad_norm": 0.7996333837509155, + "learning_rate": 1.801451695786228e-05, + "loss": 0.4039, + "step": 8900 + }, + { + "epoch": 0.28481618585652285, + "grad_norm": 1.5519790649414062, + "learning_rate": 1.7994443730729702e-05, + "loss": 0.4859, + "step": 8925 + }, + { + "epoch": 0.285613990298698, + "grad_norm": 0.7139633297920227, + "learning_rate": 1.7974370503597123e-05, + "loss": 0.466, + "step": 8950 + }, + { + "epoch": 0.28641179474087314, + "grad_norm": 1.1657685041427612, + "learning_rate": 1.7954297276464544e-05, + "loss": 0.4362, + "step": 8975 + }, + { + "epoch": 0.2872095991830482, + "grad_norm": 1.1409554481506348, + "learning_rate": 1.7934224049331962e-05, + "loss": 0.41, + "step": 9000 + }, + { + "epoch": 0.28800740362522337, + "grad_norm": 0.5146012902259827, + "learning_rate": 1.7914150822199387e-05, + "loss": 0.4733, + "step": 9025 + }, + { + "epoch": 0.2888052080673985, + "grad_norm": 1.1914423704147339, + "learning_rate": 1.7894077595066804e-05, + "loss": 0.505, + "step": 9050 + }, + { + "epoch": 0.28960301250957365, + "grad_norm": 0.5126356482505798, + "learning_rate": 1.7874004367934225e-05, + "loss": 0.4781, + "step": 9075 + }, + { + "epoch": 0.2904008169517488, + "grad_norm": 0.3558446168899536, + "learning_rate": 1.7853931140801643e-05, + "loss": 0.4813, + "step": 9100 + }, + { + "epoch": 0.29119862139392394, + "grad_norm": 0.9246950745582581, + "learning_rate": 1.7833857913669068e-05, + "loss": 0.4645, + "step": 9125 + }, + { + "epoch": 0.2919964258360991, + "grad_norm": 0.4734017550945282, + "learning_rate": 1.7813784686536485e-05, + "loss": 0.4598, + "step": 9150 + }, + { + "epoch": 0.29279423027827417, + "grad_norm": 0.7660199403762817, + "learning_rate": 1.7793711459403907e-05, + "loss": 0.5033, + "step": 9175 + }, + { + "epoch": 0.2935920347204493, + "grad_norm": 0.7290098071098328, + "learning_rate": 1.7773638232271328e-05, + "loss": 0.4783, + "step": 9200 + }, + { + "epoch": 0.29438983916262446, + "grad_norm": 0.7562490105628967, + "learning_rate": 1.775356500513875e-05, + "loss": 0.4961, + "step": 9225 + }, + { + "epoch": 0.2951876436047996, + "grad_norm": 0.31955620646476746, + "learning_rate": 1.7733491778006167e-05, + "loss": 0.5139, + "step": 9250 + }, + { + "epoch": 0.29598544804697474, + "grad_norm": 1.351357102394104, + "learning_rate": 1.7713418550873588e-05, + "loss": 0.4708, + "step": 9275 + }, + { + "epoch": 0.2967832524891499, + "grad_norm": 0.41533350944519043, + "learning_rate": 1.769334532374101e-05, + "loss": 0.4064, + "step": 9300 + }, + { + "epoch": 0.29758105693132497, + "grad_norm": 1.161253809928894, + "learning_rate": 1.767327209660843e-05, + "loss": 0.4731, + "step": 9325 + }, + { + "epoch": 0.2983788613735001, + "grad_norm": 0.7671948075294495, + "learning_rate": 
1.7653198869475848e-05, + "loss": 0.491, + "step": 9350 + }, + { + "epoch": 0.29917666581567526, + "grad_norm": 0.7606444954872131, + "learning_rate": 1.763312564234327e-05, + "loss": 0.5046, + "step": 9375 + }, + { + "epoch": 0.2999744702578504, + "grad_norm": 1.3677698373794556, + "learning_rate": 1.761305241521069e-05, + "loss": 0.3982, + "step": 9400 + }, + { + "epoch": 0.30077227470002554, + "grad_norm": 0.811142086982727, + "learning_rate": 1.759297918807811e-05, + "loss": 0.4427, + "step": 9425 + }, + { + "epoch": 0.3015700791422007, + "grad_norm": 0.864321768283844, + "learning_rate": 1.757290596094553e-05, + "loss": 0.4531, + "step": 9450 + }, + { + "epoch": 0.3023678835843758, + "grad_norm": 1.6318188905715942, + "learning_rate": 1.7552832733812953e-05, + "loss": 0.499, + "step": 9475 + }, + { + "epoch": 0.3031656880265509, + "grad_norm": 1.0822746753692627, + "learning_rate": 1.753275950668037e-05, + "loss": 0.546, + "step": 9500 + }, + { + "epoch": 0.30396349246872606, + "grad_norm": 0.5256426334381104, + "learning_rate": 1.7512686279547792e-05, + "loss": 0.4819, + "step": 9525 + }, + { + "epoch": 0.3047612969109012, + "grad_norm": 1.2467024326324463, + "learning_rate": 1.749261305241521e-05, + "loss": 0.4946, + "step": 9550 + }, + { + "epoch": 0.30555910135307635, + "grad_norm": 0.6196355819702148, + "learning_rate": 1.747253982528263e-05, + "loss": 0.4924, + "step": 9575 + }, + { + "epoch": 0.3063569057952515, + "grad_norm": 0.48964831233024597, + "learning_rate": 1.7452466598150052e-05, + "loss": 0.4554, + "step": 9600 + }, + { + "epoch": 0.3071547102374266, + "grad_norm": 1.2648134231567383, + "learning_rate": 1.7432393371017473e-05, + "loss": 0.5184, + "step": 9625 + }, + { + "epoch": 0.3079525146796017, + "grad_norm": 0.8756480813026428, + "learning_rate": 1.7412320143884895e-05, + "loss": 0.4393, + "step": 9650 + }, + { + "epoch": 0.30875031912177686, + "grad_norm": 0.6415950059890747, + "learning_rate": 1.7392246916752312e-05, + "loss": 0.519, + "step": 9675 + }, + { + "epoch": 0.309548123563952, + "grad_norm": 0.8496715426445007, + "learning_rate": 1.7372173689619733e-05, + "loss": 0.5031, + "step": 9700 + }, + { + "epoch": 0.31034592800612715, + "grad_norm": 0.42665472626686096, + "learning_rate": 1.7352100462487155e-05, + "loss": 0.5048, + "step": 9725 + }, + { + "epoch": 0.3111437324483023, + "grad_norm": 0.6046943664550781, + "learning_rate": 1.7332027235354576e-05, + "loss": 0.499, + "step": 9750 + }, + { + "epoch": 0.31194153689047743, + "grad_norm": 0.6538355350494385, + "learning_rate": 1.7311954008221993e-05, + "loss": 0.4287, + "step": 9775 + }, + { + "epoch": 0.3127393413326525, + "grad_norm": 0.4587744176387787, + "learning_rate": 1.7291880781089415e-05, + "loss": 0.4803, + "step": 9800 + }, + { + "epoch": 0.31353714577482766, + "grad_norm": 0.9619668126106262, + "learning_rate": 1.7271807553956836e-05, + "loss": 0.4662, + "step": 9825 + }, + { + "epoch": 0.3143349502170028, + "grad_norm": 0.7719420194625854, + "learning_rate": 1.7251734326824257e-05, + "loss": 0.5092, + "step": 9850 + }, + { + "epoch": 0.31513275465917795, + "grad_norm": 0.7614120841026306, + "learning_rate": 1.7231661099691675e-05, + "loss": 0.4709, + "step": 9875 + }, + { + "epoch": 0.3159305591013531, + "grad_norm": 1.1873565912246704, + "learning_rate": 1.7211587872559096e-05, + "loss": 0.4459, + "step": 9900 + }, + { + "epoch": 0.31672836354352824, + "grad_norm": 0.6782514452934265, + "learning_rate": 1.7191514645426517e-05, + "loss": 0.4737, + "step": 9925 + }, + { + "epoch": 
0.3175261679857033, + "grad_norm": 0.46671539545059204, + "learning_rate": 1.7171441418293938e-05, + "loss": 0.5144, + "step": 9950 + }, + { + "epoch": 0.31832397242787847, + "grad_norm": 0.7853887677192688, + "learning_rate": 1.7151368191161356e-05, + "loss": 0.5039, + "step": 9975 + }, + { + "epoch": 0.3191217768700536, + "grad_norm": 0.9797842502593994, + "learning_rate": 1.7131294964028777e-05, + "loss": 0.5097, + "step": 10000 + }, + { + "epoch": 0.31991958131222875, + "grad_norm": 1.0205621719360352, + "learning_rate": 1.7111221736896198e-05, + "loss": 0.4735, + "step": 10025 + }, + { + "epoch": 0.3207173857544039, + "grad_norm": 0.7101638913154602, + "learning_rate": 1.709114850976362e-05, + "loss": 0.5876, + "step": 10050 + }, + { + "epoch": 0.32151519019657904, + "grad_norm": 1.0455431938171387, + "learning_rate": 1.7071075282631037e-05, + "loss": 0.5183, + "step": 10075 + }, + { + "epoch": 0.3223129946387541, + "grad_norm": 1.2804579734802246, + "learning_rate": 1.7051002055498458e-05, + "loss": 0.472, + "step": 10100 + }, + { + "epoch": 0.32311079908092927, + "grad_norm": 0.707480788230896, + "learning_rate": 1.703092882836588e-05, + "loss": 0.4995, + "step": 10125 + }, + { + "epoch": 0.3239086035231044, + "grad_norm": 0.5702756643295288, + "learning_rate": 1.70108556012333e-05, + "loss": 0.5285, + "step": 10150 + }, + { + "epoch": 0.32470640796527955, + "grad_norm": 1.105216383934021, + "learning_rate": 1.6990782374100718e-05, + "loss": 0.4843, + "step": 10175 + }, + { + "epoch": 0.3255042124074547, + "grad_norm": 0.6410677433013916, + "learning_rate": 1.6970709146968142e-05, + "loss": 0.4855, + "step": 10200 + }, + { + "epoch": 0.32630201684962984, + "grad_norm": 1.1038342714309692, + "learning_rate": 1.695063591983556e-05, + "loss": 0.4942, + "step": 10225 + }, + { + "epoch": 0.3270998212918049, + "grad_norm": 0.8862643837928772, + "learning_rate": 1.693056269270298e-05, + "loss": 0.5562, + "step": 10250 + }, + { + "epoch": 0.32789762573398007, + "grad_norm": 0.8062041997909546, + "learning_rate": 1.69104894655704e-05, + "loss": 0.5021, + "step": 10275 + }, + { + "epoch": 0.3286954301761552, + "grad_norm": 0.8131182789802551, + "learning_rate": 1.6890416238437824e-05, + "loss": 0.47, + "step": 10300 + }, + { + "epoch": 0.32949323461833036, + "grad_norm": 0.7970181703567505, + "learning_rate": 1.687034301130524e-05, + "loss": 0.4677, + "step": 10325 + }, + { + "epoch": 0.3302910390605055, + "grad_norm": 0.6498174071311951, + "learning_rate": 1.6850269784172663e-05, + "loss": 0.4772, + "step": 10350 + }, + { + "epoch": 0.33108884350268064, + "grad_norm": 0.6998928189277649, + "learning_rate": 1.6830196557040084e-05, + "loss": 0.4691, + "step": 10375 + }, + { + "epoch": 0.3318866479448558, + "grad_norm": 0.653899073600769, + "learning_rate": 1.6810123329907505e-05, + "loss": 0.4978, + "step": 10400 + }, + { + "epoch": 0.33268445238703087, + "grad_norm": 0.8953115344047546, + "learning_rate": 1.6790050102774923e-05, + "loss": 0.4758, + "step": 10425 + }, + { + "epoch": 0.333482256829206, + "grad_norm": 1.1513452529907227, + "learning_rate": 1.6769976875642344e-05, + "loss": 0.4322, + "step": 10450 + }, + { + "epoch": 0.33428006127138116, + "grad_norm": 0.9625234007835388, + "learning_rate": 1.6749903648509765e-05, + "loss": 0.5192, + "step": 10475 + }, + { + "epoch": 0.3350778657135563, + "grad_norm": 0.9107385277748108, + "learning_rate": 1.6729830421377186e-05, + "loss": 0.5478, + "step": 10500 + }, + { + "epoch": 0.33587567015573144, + "grad_norm": 1.4335834980010986, + 
"learning_rate": 1.6709757194244604e-05, + "loss": 0.482, + "step": 10525 + }, + { + "epoch": 0.3366734745979066, + "grad_norm": 0.3528570830821991, + "learning_rate": 1.6689683967112025e-05, + "loss": 0.4542, + "step": 10550 + }, + { + "epoch": 0.3374712790400817, + "grad_norm": 0.7097840309143066, + "learning_rate": 1.6669610739979446e-05, + "loss": 0.4464, + "step": 10575 + }, + { + "epoch": 0.3382690834822568, + "grad_norm": 0.835563063621521, + "learning_rate": 1.6649537512846867e-05, + "loss": 0.4649, + "step": 10600 + }, + { + "epoch": 0.33906688792443196, + "grad_norm": 0.5886844992637634, + "learning_rate": 1.6629464285714285e-05, + "loss": 0.5257, + "step": 10625 + }, + { + "epoch": 0.3398646923666071, + "grad_norm": 1.1818501949310303, + "learning_rate": 1.660939105858171e-05, + "loss": 0.4614, + "step": 10650 + }, + { + "epoch": 0.34066249680878224, + "grad_norm": 1.0059629678726196, + "learning_rate": 1.6589317831449127e-05, + "loss": 0.4473, + "step": 10675 + }, + { + "epoch": 0.3414603012509574, + "grad_norm": 0.6061350107192993, + "learning_rate": 1.6569244604316548e-05, + "loss": 0.4962, + "step": 10700 + }, + { + "epoch": 0.3422581056931325, + "grad_norm": 0.7166891694068909, + "learning_rate": 1.6549171377183966e-05, + "loss": 0.5111, + "step": 10725 + }, + { + "epoch": 0.3430559101353076, + "grad_norm": 1.4252043962478638, + "learning_rate": 1.652909815005139e-05, + "loss": 0.46, + "step": 10750 + }, + { + "epoch": 0.34385371457748276, + "grad_norm": 0.9708260893821716, + "learning_rate": 1.6509024922918808e-05, + "loss": 0.4923, + "step": 10775 + }, + { + "epoch": 0.3446515190196579, + "grad_norm": 1.8117674589157104, + "learning_rate": 1.648895169578623e-05, + "loss": 0.4932, + "step": 10800 + }, + { + "epoch": 0.34544932346183305, + "grad_norm": 1.0717155933380127, + "learning_rate": 1.6468878468653647e-05, + "loss": 0.4686, + "step": 10825 + }, + { + "epoch": 0.3462471279040082, + "grad_norm": 1.0861729383468628, + "learning_rate": 1.644880524152107e-05, + "loss": 0.513, + "step": 10850 + }, + { + "epoch": 0.3470449323461833, + "grad_norm": 0.8470234274864197, + "learning_rate": 1.642873201438849e-05, + "loss": 0.3456, + "step": 10875 + }, + { + "epoch": 0.3478427367883584, + "grad_norm": 1.0085972547531128, + "learning_rate": 1.640865878725591e-05, + "loss": 0.4574, + "step": 10900 + }, + { + "epoch": 0.34864054123053356, + "grad_norm": 0.8033475279808044, + "learning_rate": 1.638858556012333e-05, + "loss": 0.6031, + "step": 10925 + }, + { + "epoch": 0.3494383456727087, + "grad_norm": 0.9935612082481384, + "learning_rate": 1.6368512332990753e-05, + "loss": 0.4731, + "step": 10950 + }, + { + "epoch": 0.35023615011488385, + "grad_norm": 0.6845170259475708, + "learning_rate": 1.634843910585817e-05, + "loss": 0.4265, + "step": 10975 + }, + { + "epoch": 0.351033954557059, + "grad_norm": 0.9165354371070862, + "learning_rate": 1.632836587872559e-05, + "loss": 0.4642, + "step": 11000 + }, + { + "epoch": 0.35183175899923413, + "grad_norm": 0.9118735194206238, + "learning_rate": 1.6308292651593013e-05, + "loss": 0.5264, + "step": 11025 + }, + { + "epoch": 0.3526295634414092, + "grad_norm": 0.4230351448059082, + "learning_rate": 1.6288219424460434e-05, + "loss": 0.5, + "step": 11050 + }, + { + "epoch": 0.35342736788358436, + "grad_norm": 0.555743396282196, + "learning_rate": 1.626814619732785e-05, + "loss": 0.4524, + "step": 11075 + }, + { + "epoch": 0.3542251723257595, + "grad_norm": 0.5540426969528198, + "learning_rate": 1.6248072970195273e-05, + "loss": 0.4752, + "step": 
11100 + }, + { + "epoch": 0.35502297676793465, + "grad_norm": 0.6602437496185303, + "learning_rate": 1.6227999743062694e-05, + "loss": 0.4597, + "step": 11125 + }, + { + "epoch": 0.3558207812101098, + "grad_norm": 1.2678382396697998, + "learning_rate": 1.6207926515930115e-05, + "loss": 0.4918, + "step": 11150 + }, + { + "epoch": 0.35661858565228494, + "grad_norm": 0.7993351817131042, + "learning_rate": 1.6187853288797533e-05, + "loss": 0.4044, + "step": 11175 + }, + { + "epoch": 0.35741639009446, + "grad_norm": 0.4799681007862091, + "learning_rate": 1.6167780061664954e-05, + "loss": 0.4845, + "step": 11200 + }, + { + "epoch": 0.35821419453663517, + "grad_norm": 0.4807746708393097, + "learning_rate": 1.6147706834532375e-05, + "loss": 0.4885, + "step": 11225 + }, + { + "epoch": 0.3590119989788103, + "grad_norm": 0.6480098962783813, + "learning_rate": 1.6127633607399796e-05, + "loss": 0.4902, + "step": 11250 + }, + { + "epoch": 0.35980980342098545, + "grad_norm": 0.921784520149231, + "learning_rate": 1.6107560380267214e-05, + "loss": 0.4527, + "step": 11275 + }, + { + "epoch": 0.3606076078631606, + "grad_norm": 0.7924018502235413, + "learning_rate": 1.6087487153134635e-05, + "loss": 0.4114, + "step": 11300 + }, + { + "epoch": 0.36140541230533574, + "grad_norm": 0.680823564529419, + "learning_rate": 1.6067413926002056e-05, + "loss": 0.4844, + "step": 11325 + }, + { + "epoch": 0.3622032167475108, + "grad_norm": 0.6174274682998657, + "learning_rate": 1.6047340698869477e-05, + "loss": 0.4905, + "step": 11350 + }, + { + "epoch": 0.36300102118968597, + "grad_norm": 0.9357193112373352, + "learning_rate": 1.60272674717369e-05, + "loss": 0.4846, + "step": 11375 + }, + { + "epoch": 0.3637988256318611, + "grad_norm": 0.8845779895782471, + "learning_rate": 1.6007194244604316e-05, + "loss": 0.48, + "step": 11400 + }, + { + "epoch": 0.36459663007403625, + "grad_norm": 1.3138706684112549, + "learning_rate": 1.5987121017471737e-05, + "loss": 0.4544, + "step": 11425 + }, + { + "epoch": 0.3653944345162114, + "grad_norm": 0.6036775708198547, + "learning_rate": 1.596704779033916e-05, + "loss": 0.4878, + "step": 11450 + }, + { + "epoch": 0.36619223895838654, + "grad_norm": 0.646816611289978, + "learning_rate": 1.594697456320658e-05, + "loss": 0.5299, + "step": 11475 + }, + { + "epoch": 0.3669900434005616, + "grad_norm": 0.8093705177307129, + "learning_rate": 1.5926901336073997e-05, + "loss": 0.4264, + "step": 11500 + }, + { + "epoch": 0.36778784784273677, + "grad_norm": 0.7628122568130493, + "learning_rate": 1.590682810894142e-05, + "loss": 0.467, + "step": 11525 + }, + { + "epoch": 0.3685856522849119, + "grad_norm": 0.45453178882598877, + "learning_rate": 1.588675488180884e-05, + "loss": 0.514, + "step": 11550 + }, + { + "epoch": 0.36938345672708706, + "grad_norm": 0.6202525496482849, + "learning_rate": 1.586668165467626e-05, + "loss": 0.4646, + "step": 11575 + }, + { + "epoch": 0.3701812611692622, + "grad_norm": 0.7522080540657043, + "learning_rate": 1.584660842754368e-05, + "loss": 0.4502, + "step": 11600 + }, + { + "epoch": 0.37097906561143734, + "grad_norm": 0.9508293867111206, + "learning_rate": 1.58265352004111e-05, + "loss": 0.4064, + "step": 11625 + }, + { + "epoch": 0.3717768700536125, + "grad_norm": 1.5886996984481812, + "learning_rate": 1.580646197327852e-05, + "loss": 0.4133, + "step": 11650 + }, + { + "epoch": 0.37257467449578757, + "grad_norm": 0.9361369013786316, + "learning_rate": 1.5786388746145942e-05, + "loss": 0.4868, + "step": 11675 + }, + { + "epoch": 0.3733724789379627, + "grad_norm": 
1.056349515914917, + "learning_rate": 1.576631551901336e-05, + "loss": 0.46, + "step": 11700 + }, + { + "epoch": 0.37417028338013786, + "grad_norm": 0.6318427324295044, + "learning_rate": 1.574624229188078e-05, + "loss": 0.48, + "step": 11725 + }, + { + "epoch": 0.374968087822313, + "grad_norm": 5.217873573303223, + "learning_rate": 1.5726169064748202e-05, + "loss": 0.5168, + "step": 11750 + }, + { + "epoch": 0.37576589226448814, + "grad_norm": 0.5438673496246338, + "learning_rate": 1.5706095837615623e-05, + "loss": 0.4524, + "step": 11775 + }, + { + "epoch": 0.3765636967066633, + "grad_norm": 0.9590070843696594, + "learning_rate": 1.568602261048304e-05, + "loss": 0.5341, + "step": 11800 + }, + { + "epoch": 0.3773615011488384, + "grad_norm": 1.6194452047348022, + "learning_rate": 1.5665949383350465e-05, + "loss": 0.5055, + "step": 11825 + }, + { + "epoch": 0.3781593055910135, + "grad_norm": 0.5337164998054504, + "learning_rate": 1.5645876156217883e-05, + "loss": 0.4146, + "step": 11850 + }, + { + "epoch": 0.37895711003318866, + "grad_norm": 0.4194962680339813, + "learning_rate": 1.5625802929085304e-05, + "loss": 0.4396, + "step": 11875 + }, + { + "epoch": 0.3797549144753638, + "grad_norm": 0.8845074772834778, + "learning_rate": 1.5605729701952722e-05, + "loss": 0.4649, + "step": 11900 + }, + { + "epoch": 0.38055271891753895, + "grad_norm": 0.4936656653881073, + "learning_rate": 1.5585656474820146e-05, + "loss": 0.5033, + "step": 11925 + }, + { + "epoch": 0.3813505233597141, + "grad_norm": 1.3536922931671143, + "learning_rate": 1.5565583247687564e-05, + "loss": 0.5048, + "step": 11950 + }, + { + "epoch": 0.3821483278018892, + "grad_norm": 0.27829474210739136, + "learning_rate": 1.5545510020554985e-05, + "loss": 0.4022, + "step": 11975 + }, + { + "epoch": 0.3829461322440643, + "grad_norm": 1.678222894668579, + "learning_rate": 1.5525436793422403e-05, + "loss": 0.4964, + "step": 12000 + }, + { + "epoch": 0.38374393668623946, + "grad_norm": 0.45845431089401245, + "learning_rate": 1.5505363566289828e-05, + "loss": 0.4632, + "step": 12025 + }, + { + "epoch": 0.3845417411284146, + "grad_norm": 0.9954239726066589, + "learning_rate": 1.5485290339157245e-05, + "loss": 0.5619, + "step": 12050 + }, + { + "epoch": 0.38533954557058975, + "grad_norm": 0.4554518461227417, + "learning_rate": 1.5465217112024666e-05, + "loss": 0.4775, + "step": 12075 + }, + { + "epoch": 0.3861373500127649, + "grad_norm": 1.5614060163497925, + "learning_rate": 1.5445143884892088e-05, + "loss": 0.4309, + "step": 12100 + }, + { + "epoch": 0.38693515445494, + "grad_norm": 0.525596559047699, + "learning_rate": 1.542507065775951e-05, + "loss": 0.4973, + "step": 12125 + }, + { + "epoch": 0.3877329588971151, + "grad_norm": 1.0024532079696655, + "learning_rate": 1.5404997430626926e-05, + "loss": 0.4464, + "step": 12150 + }, + { + "epoch": 0.38853076333929026, + "grad_norm": 1.3986321687698364, + "learning_rate": 1.5384924203494348e-05, + "loss": 0.4882, + "step": 12175 + }, + { + "epoch": 0.3893285677814654, + "grad_norm": 1.0822752714157104, + "learning_rate": 1.536485097636177e-05, + "loss": 0.4517, + "step": 12200 + }, + { + "epoch": 0.39012637222364055, + "grad_norm": 1.104766607284546, + "learning_rate": 1.534477774922919e-05, + "loss": 0.4684, + "step": 12225 + }, + { + "epoch": 0.3909241766658157, + "grad_norm": 0.4974013566970825, + "learning_rate": 1.5324704522096608e-05, + "loss": 0.4644, + "step": 12250 + }, + { + "epoch": 0.39172198110799084, + "grad_norm": 1.233022689819336, + "learning_rate": 1.5304631294964032e-05, + 
"loss": 0.4296, + "step": 12275 + }, + { + "epoch": 0.3925197855501659, + "grad_norm": 1.320533275604248, + "learning_rate": 1.528455806783145e-05, + "loss": 0.4937, + "step": 12300 + }, + { + "epoch": 0.39331758999234107, + "grad_norm": 0.3013911545276642, + "learning_rate": 1.526448484069887e-05, + "loss": 0.4021, + "step": 12325 + }, + { + "epoch": 0.3941153944345162, + "grad_norm": 0.8026270270347595, + "learning_rate": 1.5244411613566289e-05, + "loss": 0.4949, + "step": 12350 + }, + { + "epoch": 0.39491319887669135, + "grad_norm": 0.7568235993385315, + "learning_rate": 1.5224338386433712e-05, + "loss": 0.4236, + "step": 12375 + }, + { + "epoch": 0.3957110033188665, + "grad_norm": 0.8384756445884705, + "learning_rate": 1.5204265159301131e-05, + "loss": 0.4697, + "step": 12400 + }, + { + "epoch": 0.39650880776104164, + "grad_norm": 0.9342674016952515, + "learning_rate": 1.518419193216855e-05, + "loss": 0.4424, + "step": 12425 + }, + { + "epoch": 0.3973066122032167, + "grad_norm": 0.8164283037185669, + "learning_rate": 1.516411870503597e-05, + "loss": 0.4843, + "step": 12450 + }, + { + "epoch": 0.39810441664539187, + "grad_norm": 0.8556693196296692, + "learning_rate": 1.5144045477903393e-05, + "loss": 0.4874, + "step": 12475 + }, + { + "epoch": 0.398902221087567, + "grad_norm": 1.1020561456680298, + "learning_rate": 1.5123972250770812e-05, + "loss": 0.4821, + "step": 12500 + }, + { + "epoch": 0.39970002552974215, + "grad_norm": 0.43005600571632385, + "learning_rate": 1.5103899023638232e-05, + "loss": 0.4774, + "step": 12525 + }, + { + "epoch": 0.4004978299719173, + "grad_norm": 1.0019757747650146, + "learning_rate": 1.5083825796505654e-05, + "loss": 0.47, + "step": 12550 + }, + { + "epoch": 0.40129563441409244, + "grad_norm": 0.9805948734283447, + "learning_rate": 1.5063752569373074e-05, + "loss": 0.4655, + "step": 12575 + }, + { + "epoch": 0.4020934388562675, + "grad_norm": 1.0946297645568848, + "learning_rate": 1.5043679342240493e-05, + "loss": 0.4656, + "step": 12600 + }, + { + "epoch": 0.40289124329844267, + "grad_norm": 0.7530800700187683, + "learning_rate": 1.5023606115107913e-05, + "loss": 0.4989, + "step": 12625 + }, + { + "epoch": 0.4036890477406178, + "grad_norm": 0.7736749053001404, + "learning_rate": 1.5003532887975336e-05, + "loss": 0.474, + "step": 12650 + }, + { + "epoch": 0.40448685218279296, + "grad_norm": 0.6549740433692932, + "learning_rate": 1.4983459660842755e-05, + "loss": 0.4824, + "step": 12675 + }, + { + "epoch": 0.4052846566249681, + "grad_norm": 0.556654691696167, + "learning_rate": 1.4963386433710174e-05, + "loss": 0.4596, + "step": 12700 + }, + { + "epoch": 0.40608246106714324, + "grad_norm": 0.8068644404411316, + "learning_rate": 1.4943313206577597e-05, + "loss": 0.4255, + "step": 12725 + }, + { + "epoch": 0.40688026550931833, + "grad_norm": 0.7896491289138794, + "learning_rate": 1.4923239979445017e-05, + "loss": 0.4676, + "step": 12750 + }, + { + "epoch": 0.40767806995149347, + "grad_norm": 0.8864936232566833, + "learning_rate": 1.4903166752312436e-05, + "loss": 0.4526, + "step": 12775 + }, + { + "epoch": 0.4084758743936686, + "grad_norm": 0.7941861748695374, + "learning_rate": 1.4883093525179856e-05, + "loss": 0.4269, + "step": 12800 + }, + { + "epoch": 0.40927367883584376, + "grad_norm": 0.8171241879463196, + "learning_rate": 1.4863020298047278e-05, + "loss": 0.4955, + "step": 12825 + }, + { + "epoch": 0.4100714832780189, + "grad_norm": 0.7162429094314575, + "learning_rate": 1.4842947070914698e-05, + "loss": 0.4507, + "step": 12850 + }, + { + "epoch": 
0.41086928772019404, + "grad_norm": 0.6082909107208252, + "learning_rate": 1.4822873843782117e-05, + "loss": 0.4514, + "step": 12875 + }, + { + "epoch": 0.4116670921623692, + "grad_norm": 0.9657214879989624, + "learning_rate": 1.4802800616649537e-05, + "loss": 0.4681, + "step": 12900 + }, + { + "epoch": 0.4124648966045443, + "grad_norm": 0.9833455681800842, + "learning_rate": 1.478272738951696e-05, + "loss": 0.5292, + "step": 12925 + }, + { + "epoch": 0.4132627010467194, + "grad_norm": 0.7374243140220642, + "learning_rate": 1.4762654162384379e-05, + "loss": 0.4767, + "step": 12950 + }, + { + "epoch": 0.41406050548889456, + "grad_norm": 0.7715759873390198, + "learning_rate": 1.4742580935251798e-05, + "loss": 0.4711, + "step": 12975 + }, + { + "epoch": 0.4148583099310697, + "grad_norm": 0.7317653298377991, + "learning_rate": 1.4722507708119221e-05, + "loss": 0.4605, + "step": 13000 + }, + { + "epoch": 0.41565611437324484, + "grad_norm": 0.5078895092010498, + "learning_rate": 1.470243448098664e-05, + "loss": 0.4192, + "step": 13025 + }, + { + "epoch": 0.41645391881542, + "grad_norm": 0.4797663390636444, + "learning_rate": 1.468236125385406e-05, + "loss": 0.4826, + "step": 13050 + }, + { + "epoch": 0.4172517232575951, + "grad_norm": 0.7433570027351379, + "learning_rate": 1.466228802672148e-05, + "loss": 0.4962, + "step": 13075 + }, + { + "epoch": 0.4180495276997702, + "grad_norm": 0.7199302315711975, + "learning_rate": 1.4642214799588902e-05, + "loss": 0.4215, + "step": 13100 + }, + { + "epoch": 0.41884733214194536, + "grad_norm": 1.2280867099761963, + "learning_rate": 1.4622141572456322e-05, + "loss": 0.5665, + "step": 13125 + }, + { + "epoch": 0.4196451365841205, + "grad_norm": 1.1554296016693115, + "learning_rate": 1.4602068345323741e-05, + "loss": 0.5012, + "step": 13150 + }, + { + "epoch": 0.42044294102629565, + "grad_norm": 0.9082030057907104, + "learning_rate": 1.458199511819116e-05, + "loss": 0.4835, + "step": 13175 + }, + { + "epoch": 0.4212407454684708, + "grad_norm": 1.0009952783584595, + "learning_rate": 1.4561921891058584e-05, + "loss": 0.5488, + "step": 13200 + }, + { + "epoch": 0.4220385499106459, + "grad_norm": 0.5102426409721375, + "learning_rate": 1.4541848663926003e-05, + "loss": 0.5196, + "step": 13225 + }, + { + "epoch": 0.422836354352821, + "grad_norm": 1.0560364723205566, + "learning_rate": 1.4521775436793422e-05, + "loss": 0.4803, + "step": 13250 + }, + { + "epoch": 0.42363415879499616, + "grad_norm": 1.5170994997024536, + "learning_rate": 1.4501702209660845e-05, + "loss": 0.4307, + "step": 13275 + }, + { + "epoch": 0.4244319632371713, + "grad_norm": 0.8432527184486389, + "learning_rate": 1.4481628982528265e-05, + "loss": 0.5236, + "step": 13300 + }, + { + "epoch": 0.42522976767934645, + "grad_norm": 0.5388304591178894, + "learning_rate": 1.4461555755395684e-05, + "loss": 0.5089, + "step": 13325 + }, + { + "epoch": 0.4260275721215216, + "grad_norm": 1.233600378036499, + "learning_rate": 1.4441482528263104e-05, + "loss": 0.433, + "step": 13350 + }, + { + "epoch": 0.4268253765636967, + "grad_norm": 1.7418749332427979, + "learning_rate": 1.4421409301130526e-05, + "loss": 0.3748, + "step": 13375 + }, + { + "epoch": 0.4276231810058718, + "grad_norm": 0.7089687585830688, + "learning_rate": 1.4401336073997946e-05, + "loss": 0.405, + "step": 13400 + }, + { + "epoch": 0.42842098544804696, + "grad_norm": 0.848208487033844, + "learning_rate": 1.4381262846865365e-05, + "loss": 0.4831, + "step": 13425 + }, + { + "epoch": 0.4292187898902221, + "grad_norm": 0.5146478414535522, + 
"learning_rate": 1.4361189619732786e-05, + "loss": 0.4587, + "step": 13450 + }, + { + "epoch": 0.43001659433239725, + "grad_norm": 0.4021094739437103, + "learning_rate": 1.4341116392600208e-05, + "loss": 0.4515, + "step": 13475 + }, + { + "epoch": 0.4308143987745724, + "grad_norm": 0.797254204750061, + "learning_rate": 1.4321043165467627e-05, + "loss": 0.4606, + "step": 13500 + }, + { + "epoch": 0.43161220321674754, + "grad_norm": 0.7970134019851685, + "learning_rate": 1.4300969938335046e-05, + "loss": 0.4588, + "step": 13525 + }, + { + "epoch": 0.4324100076589226, + "grad_norm": 0.7470701336860657, + "learning_rate": 1.4280896711202468e-05, + "loss": 0.4378, + "step": 13550 + }, + { + "epoch": 0.43320781210109777, + "grad_norm": 0.998225748538971, + "learning_rate": 1.4260823484069889e-05, + "loss": 0.4534, + "step": 13575 + }, + { + "epoch": 0.4340056165432729, + "grad_norm": 1.5322880744934082, + "learning_rate": 1.4240750256937308e-05, + "loss": 0.4574, + "step": 13600 + }, + { + "epoch": 0.43480342098544805, + "grad_norm": 2.1111321449279785, + "learning_rate": 1.4220677029804728e-05, + "loss": 0.548, + "step": 13625 + }, + { + "epoch": 0.4356012254276232, + "grad_norm": 1.3010362386703491, + "learning_rate": 1.4200603802672149e-05, + "loss": 0.3906, + "step": 13650 + }, + { + "epoch": 0.43639902986979834, + "grad_norm": 0.4892048239707947, + "learning_rate": 1.418053057553957e-05, + "loss": 0.4437, + "step": 13675 + }, + { + "epoch": 0.4371968343119734, + "grad_norm": 0.603466808795929, + "learning_rate": 1.416045734840699e-05, + "loss": 0.4823, + "step": 13700 + }, + { + "epoch": 0.43799463875414857, + "grad_norm": 0.5488519072532654, + "learning_rate": 1.414038412127441e-05, + "loss": 0.4642, + "step": 13725 + }, + { + "epoch": 0.4387924431963237, + "grad_norm": 0.9897418022155762, + "learning_rate": 1.412031089414183e-05, + "loss": 0.5128, + "step": 13750 + }, + { + "epoch": 0.43959024763849885, + "grad_norm": 0.32410895824432373, + "learning_rate": 1.410023766700925e-05, + "loss": 0.3808, + "step": 13775 + }, + { + "epoch": 0.440388052080674, + "grad_norm": 1.3885278701782227, + "learning_rate": 1.408016443987667e-05, + "loss": 0.4951, + "step": 13800 + }, + { + "epoch": 0.44118585652284914, + "grad_norm": 0.7485048770904541, + "learning_rate": 1.4060091212744092e-05, + "loss": 0.5199, + "step": 13825 + }, + { + "epoch": 0.4419836609650242, + "grad_norm": 0.6236168146133423, + "learning_rate": 1.4040017985611511e-05, + "loss": 0.4795, + "step": 13850 + }, + { + "epoch": 0.44278146540719937, + "grad_norm": 0.7369155287742615, + "learning_rate": 1.401994475847893e-05, + "loss": 0.4874, + "step": 13875 + }, + { + "epoch": 0.4435792698493745, + "grad_norm": 1.1749411821365356, + "learning_rate": 1.3999871531346353e-05, + "loss": 0.4792, + "step": 13900 + }, + { + "epoch": 0.44437707429154966, + "grad_norm": 0.394364595413208, + "learning_rate": 1.3979798304213773e-05, + "loss": 0.4384, + "step": 13925 + }, + { + "epoch": 0.4451748787337248, + "grad_norm": 0.717343807220459, + "learning_rate": 1.3959725077081192e-05, + "loss": 0.5035, + "step": 13950 + }, + { + "epoch": 0.44597268317589994, + "grad_norm": 0.9770955443382263, + "learning_rate": 1.3939651849948612e-05, + "loss": 0.4876, + "step": 13975 + }, + { + "epoch": 0.4467704876180751, + "grad_norm": 0.1990404576063156, + "learning_rate": 1.3919578622816034e-05, + "loss": 0.5259, + "step": 14000 + }, + { + "epoch": 0.4475682920602502, + "grad_norm": 1.2714977264404297, + "learning_rate": 1.3899505395683454e-05, + "loss": 0.5204, + 
"step": 14025 + }, + { + "epoch": 0.4483660965024253, + "grad_norm": 0.9181020855903625, + "learning_rate": 1.3879432168550873e-05, + "loss": 0.5153, + "step": 14050 + }, + { + "epoch": 0.44916390094460046, + "grad_norm": 0.7125169038772583, + "learning_rate": 1.3859358941418293e-05, + "loss": 0.4085, + "step": 14075 + }, + { + "epoch": 0.4499617053867756, + "grad_norm": 0.9389515519142151, + "learning_rate": 1.3839285714285715e-05, + "loss": 0.4255, + "step": 14100 + }, + { + "epoch": 0.45075950982895074, + "grad_norm": 0.9015195369720459, + "learning_rate": 1.3819212487153135e-05, + "loss": 0.4657, + "step": 14125 + }, + { + "epoch": 0.4515573142711259, + "grad_norm": 0.5635702610015869, + "learning_rate": 1.3799139260020554e-05, + "loss": 0.4839, + "step": 14150 + }, + { + "epoch": 0.452355118713301, + "grad_norm": 0.6795171499252319, + "learning_rate": 1.3779066032887977e-05, + "loss": 0.4617, + "step": 14175 + }, + { + "epoch": 0.4531529231554761, + "grad_norm": 1.020156741142273, + "learning_rate": 1.3758992805755397e-05, + "loss": 0.519, + "step": 14200 + }, + { + "epoch": 0.45395072759765126, + "grad_norm": 1.1862505674362183, + "learning_rate": 1.3738919578622816e-05, + "loss": 0.5016, + "step": 14225 + }, + { + "epoch": 0.4547485320398264, + "grad_norm": 0.9645688533782959, + "learning_rate": 1.3718846351490236e-05, + "loss": 0.516, + "step": 14250 + }, + { + "epoch": 0.45554633648200155, + "grad_norm": 0.8609448671340942, + "learning_rate": 1.3698773124357658e-05, + "loss": 0.4662, + "step": 14275 + }, + { + "epoch": 0.4563441409241767, + "grad_norm": 0.7543959617614746, + "learning_rate": 1.3678699897225078e-05, + "loss": 0.4866, + "step": 14300 + }, + { + "epoch": 0.4571419453663518, + "grad_norm": 1.1085060834884644, + "learning_rate": 1.3658626670092497e-05, + "loss": 0.4216, + "step": 14325 + }, + { + "epoch": 0.4579397498085269, + "grad_norm": 1.7177371978759766, + "learning_rate": 1.3638553442959917e-05, + "loss": 0.5042, + "step": 14350 + }, + { + "epoch": 0.45873755425070206, + "grad_norm": 0.8194246888160706, + "learning_rate": 1.361848021582734e-05, + "loss": 0.4852, + "step": 14375 + }, + { + "epoch": 0.4595353586928772, + "grad_norm": 0.5089772343635559, + "learning_rate": 1.3598406988694759e-05, + "loss": 0.5001, + "step": 14400 + }, + { + "epoch": 0.46033316313505235, + "grad_norm": 0.7217407822608948, + "learning_rate": 1.3578333761562178e-05, + "loss": 0.3952, + "step": 14425 + }, + { + "epoch": 0.4611309675772275, + "grad_norm": 0.9532568454742432, + "learning_rate": 1.3558260534429601e-05, + "loss": 0.3844, + "step": 14450 + }, + { + "epoch": 0.4619287720194026, + "grad_norm": 0.8932861685752869, + "learning_rate": 1.353818730729702e-05, + "loss": 0.4268, + "step": 14475 + }, + { + "epoch": 0.4627265764615777, + "grad_norm": 0.7293800115585327, + "learning_rate": 1.351811408016444e-05, + "loss": 0.4509, + "step": 14500 + }, + { + "epoch": 0.46352438090375286, + "grad_norm": 0.616687536239624, + "learning_rate": 1.349804085303186e-05, + "loss": 0.4004, + "step": 14525 + }, + { + "epoch": 0.464322185345928, + "grad_norm": 0.6240351796150208, + "learning_rate": 1.3477967625899282e-05, + "loss": 0.4765, + "step": 14550 + }, + { + "epoch": 0.46511998978810315, + "grad_norm": 0.7241207361221313, + "learning_rate": 1.3457894398766702e-05, + "loss": 0.5139, + "step": 14575 + }, + { + "epoch": 0.4659177942302783, + "grad_norm": 0.6566533446311951, + "learning_rate": 1.3437821171634121e-05, + "loss": 0.4583, + "step": 14600 + }, + { + "epoch": 0.46671559867245344, + 
"grad_norm": 0.5932489633560181, + "learning_rate": 1.3417747944501544e-05, + "loss": 0.4816, + "step": 14625 + }, + { + "epoch": 0.4675134031146285, + "grad_norm": 0.9913176894187927, + "learning_rate": 1.3397674717368963e-05, + "loss": 0.4806, + "step": 14650 + }, + { + "epoch": 0.46831120755680367, + "grad_norm": 0.5182284116744995, + "learning_rate": 1.3377601490236383e-05, + "loss": 0.4503, + "step": 14675 + }, + { + "epoch": 0.4691090119989788, + "grad_norm": 0.3942260444164276, + "learning_rate": 1.3357528263103802e-05, + "loss": 0.4434, + "step": 14700 + }, + { + "epoch": 0.46990681644115395, + "grad_norm": 0.7585189342498779, + "learning_rate": 1.3337455035971225e-05, + "loss": 0.5389, + "step": 14725 + }, + { + "epoch": 0.4707046208833291, + "grad_norm": 0.6170709133148193, + "learning_rate": 1.3317381808838645e-05, + "loss": 0.4664, + "step": 14750 + }, + { + "epoch": 0.47150242532550424, + "grad_norm": 0.7014161348342896, + "learning_rate": 1.3297308581706064e-05, + "loss": 0.4745, + "step": 14775 + }, + { + "epoch": 0.4723002297676793, + "grad_norm": 1.9517532587051392, + "learning_rate": 1.3277235354573483e-05, + "loss": 0.4908, + "step": 14800 + }, + { + "epoch": 0.47309803420985447, + "grad_norm": 0.47267842292785645, + "learning_rate": 1.3257162127440906e-05, + "loss": 0.3736, + "step": 14825 + }, + { + "epoch": 0.4738958386520296, + "grad_norm": 0.5750212073326111, + "learning_rate": 1.3237088900308326e-05, + "loss": 0.5174, + "step": 14850 + }, + { + "epoch": 0.47469364309420475, + "grad_norm": 0.7819316983222961, + "learning_rate": 1.3217015673175745e-05, + "loss": 0.4964, + "step": 14875 + }, + { + "epoch": 0.4754914475363799, + "grad_norm": 1.4765163660049438, + "learning_rate": 1.3196942446043168e-05, + "loss": 0.4414, + "step": 14900 + }, + { + "epoch": 0.47628925197855504, + "grad_norm": 0.7535708546638489, + "learning_rate": 1.3176869218910587e-05, + "loss": 0.4577, + "step": 14925 + }, + { + "epoch": 0.4770870564207301, + "grad_norm": 1.512585997581482, + "learning_rate": 1.3156795991778007e-05, + "loss": 0.4547, + "step": 14950 + }, + { + "epoch": 0.47788486086290527, + "grad_norm": 0.7475982308387756, + "learning_rate": 1.3136722764645426e-05, + "loss": 0.4786, + "step": 14975 + }, + { + "epoch": 0.4786826653050804, + "grad_norm": 0.29426124691963196, + "learning_rate": 1.311664953751285e-05, + "loss": 0.4754, + "step": 15000 + }, + { + "epoch": 0.47948046974725556, + "grad_norm": 0.7986690402030945, + "learning_rate": 1.3096576310380269e-05, + "loss": 0.4829, + "step": 15025 + }, + { + "epoch": 0.4802782741894307, + "grad_norm": 0.7937321662902832, + "learning_rate": 1.3076503083247688e-05, + "loss": 0.479, + "step": 15050 + }, + { + "epoch": 0.48107607863160584, + "grad_norm": 0.8871631026268005, + "learning_rate": 1.3056429856115107e-05, + "loss": 0.4428, + "step": 15075 + }, + { + "epoch": 0.48187388307378093, + "grad_norm": 1.7166088819503784, + "learning_rate": 1.303635662898253e-05, + "loss": 0.4494, + "step": 15100 + }, + { + "epoch": 0.48267168751595607, + "grad_norm": 1.0895076990127563, + "learning_rate": 1.301628340184995e-05, + "loss": 0.4473, + "step": 15125 + }, + { + "epoch": 0.4834694919581312, + "grad_norm": 0.8884141445159912, + "learning_rate": 1.299621017471737e-05, + "loss": 0.4681, + "step": 15150 + }, + { + "epoch": 0.48426729640030636, + "grad_norm": 1.1340069770812988, + "learning_rate": 1.297613694758479e-05, + "loss": 0.5118, + "step": 15175 + }, + { + "epoch": 0.4850651008424815, + "grad_norm": 0.3954940140247345, + "learning_rate": 
1.2956063720452211e-05, + "loss": 0.4181, + "step": 15200 + }, + { + "epoch": 0.48586290528465664, + "grad_norm": 0.9146122932434082, + "learning_rate": 1.2935990493319631e-05, + "loss": 0.4984, + "step": 15225 + }, + { + "epoch": 0.4866607097268318, + "grad_norm": 0.6080213189125061, + "learning_rate": 1.291591726618705e-05, + "loss": 0.4609, + "step": 15250 + }, + { + "epoch": 0.4874585141690069, + "grad_norm": 1.3363434076309204, + "learning_rate": 1.2895844039054471e-05, + "loss": 0.5055, + "step": 15275 + }, + { + "epoch": 0.488256318611182, + "grad_norm": 0.7919199466705322, + "learning_rate": 1.2875770811921891e-05, + "loss": 0.4164, + "step": 15300 + }, + { + "epoch": 0.48905412305335716, + "grad_norm": 0.7286228537559509, + "learning_rate": 1.2855697584789312e-05, + "loss": 0.488, + "step": 15325 + }, + { + "epoch": 0.4898519274955323, + "grad_norm": 0.8509761691093445, + "learning_rate": 1.2835624357656733e-05, + "loss": 0.4565, + "step": 15350 + }, + { + "epoch": 0.49064973193770745, + "grad_norm": 0.9142221808433533, + "learning_rate": 1.2815551130524153e-05, + "loss": 0.5023, + "step": 15375 + }, + { + "epoch": 0.4914475363798826, + "grad_norm": 0.9267359972000122, + "learning_rate": 1.2795477903391572e-05, + "loss": 0.4025, + "step": 15400 + }, + { + "epoch": 0.4922453408220577, + "grad_norm": 0.964512288570404, + "learning_rate": 1.2775404676258993e-05, + "loss": 0.5004, + "step": 15425 + }, + { + "epoch": 0.4930431452642328, + "grad_norm": 0.857150137424469, + "learning_rate": 1.2755331449126414e-05, + "loss": 0.4275, + "step": 15450 + }, + { + "epoch": 0.49384094970640796, + "grad_norm": 0.49004054069519043, + "learning_rate": 1.2735258221993834e-05, + "loss": 0.4849, + "step": 15475 + }, + { + "epoch": 0.4946387541485831, + "grad_norm": 0.9604198932647705, + "learning_rate": 1.2715184994861253e-05, + "loss": 0.4593, + "step": 15500 + }, + { + "epoch": 0.49543655859075825, + "grad_norm": 0.5274483561515808, + "learning_rate": 1.2695111767728674e-05, + "loss": 0.4779, + "step": 15525 + }, + { + "epoch": 0.4962343630329334, + "grad_norm": 2.0652174949645996, + "learning_rate": 1.2675038540596095e-05, + "loss": 0.4492, + "step": 15550 + }, + { + "epoch": 0.4970321674751085, + "grad_norm": 0.8072428703308105, + "learning_rate": 1.2654965313463515e-05, + "loss": 0.492, + "step": 15575 + }, + { + "epoch": 0.4978299719172836, + "grad_norm": 0.5541747212409973, + "learning_rate": 1.2634892086330934e-05, + "loss": 0.449, + "step": 15600 + }, + { + "epoch": 0.49862777635945876, + "grad_norm": 0.773730456829071, + "learning_rate": 1.2614818859198357e-05, + "loss": 0.4562, + "step": 15625 + }, + { + "epoch": 0.4994255808016339, + "grad_norm": 0.9741753339767456, + "learning_rate": 1.2594745632065777e-05, + "loss": 0.4142, + "step": 15650 + }, + { + "epoch": 0.500223385243809, + "grad_norm": 1.2298176288604736, + "learning_rate": 1.2574672404933196e-05, + "loss": 0.5186, + "step": 15675 + }, + { + "epoch": 0.5010211896859842, + "grad_norm": 0.8459392189979553, + "learning_rate": 1.2554599177800615e-05, + "loss": 0.5335, + "step": 15700 + }, + { + "epoch": 0.5018189941281593, + "grad_norm": 0.9389364123344421, + "learning_rate": 1.2534525950668038e-05, + "loss": 0.4762, + "step": 15725 + }, + { + "epoch": 0.5026167985703345, + "grad_norm": 1.0771899223327637, + "learning_rate": 1.2514452723535458e-05, + "loss": 0.5067, + "step": 15750 + }, + { + "epoch": 0.5034146030125096, + "grad_norm": 1.1614314317703247, + "learning_rate": 1.2494379496402877e-05, + "loss": 0.4776, + "step": 15775 + 
}, + { + "epoch": 0.5042124074546847, + "grad_norm": 1.7263872623443604, + "learning_rate": 1.2474306269270298e-05, + "loss": 0.4122, + "step": 15800 + }, + { + "epoch": 0.5050102118968598, + "grad_norm": 1.081225037574768, + "learning_rate": 1.245423304213772e-05, + "loss": 0.4653, + "step": 15825 + }, + { + "epoch": 0.5058080163390349, + "grad_norm": 0.6398253440856934, + "learning_rate": 1.2434159815005139e-05, + "loss": 0.4563, + "step": 15850 + }, + { + "epoch": 0.5066058207812101, + "grad_norm": 1.2320051193237305, + "learning_rate": 1.241408658787256e-05, + "loss": 0.4612, + "step": 15875 + }, + { + "epoch": 0.5074036252233852, + "grad_norm": 0.6390859484672546, + "learning_rate": 1.239401336073998e-05, + "loss": 0.4836, + "step": 15900 + }, + { + "epoch": 0.5082014296655604, + "grad_norm": 0.7116482853889465, + "learning_rate": 1.23739401336074e-05, + "loss": 0.4157, + "step": 15925 + }, + { + "epoch": 0.5089992341077355, + "grad_norm": 0.8386733531951904, + "learning_rate": 1.235386690647482e-05, + "loss": 0.4989, + "step": 15950 + }, + { + "epoch": 0.5097970385499107, + "grad_norm": 0.46999993920326233, + "learning_rate": 1.2333793679342241e-05, + "loss": 0.5006, + "step": 15975 + }, + { + "epoch": 0.5105948429920858, + "grad_norm": 0.6487120985984802, + "learning_rate": 1.231372045220966e-05, + "loss": 0.4817, + "step": 16000 + }, + { + "epoch": 0.5113926474342609, + "grad_norm": 0.7748807668685913, + "learning_rate": 1.2293647225077082e-05, + "loss": 0.435, + "step": 16025 + }, + { + "epoch": 0.5121904518764361, + "grad_norm": 1.0600996017456055, + "learning_rate": 1.2273573997944503e-05, + "loss": 0.4186, + "step": 16050 + }, + { + "epoch": 0.5129882563186112, + "grad_norm": 0.6807442307472229, + "learning_rate": 1.2253500770811922e-05, + "loss": 0.4519, + "step": 16075 + }, + { + "epoch": 0.5137860607607864, + "grad_norm": 0.8556498885154724, + "learning_rate": 1.2233427543679343e-05, + "loss": 0.4238, + "step": 16100 + }, + { + "epoch": 0.5145838652029614, + "grad_norm": 1.2915552854537964, + "learning_rate": 1.2213354316546763e-05, + "loss": 0.4428, + "step": 16125 + }, + { + "epoch": 0.5153816696451365, + "grad_norm": 1.086190104484558, + "learning_rate": 1.2193281089414184e-05, + "loss": 0.5217, + "step": 16150 + }, + { + "epoch": 0.5161794740873117, + "grad_norm": 0.7254406809806824, + "learning_rate": 1.2173207862281603e-05, + "loss": 0.4927, + "step": 16175 + }, + { + "epoch": 0.5169772785294868, + "grad_norm": 1.0535979270935059, + "learning_rate": 1.2153134635149025e-05, + "loss": 0.5605, + "step": 16200 + }, + { + "epoch": 0.517775082971662, + "grad_norm": 0.9690276980400085, + "learning_rate": 1.2133061408016444e-05, + "loss": 0.4528, + "step": 16225 + }, + { + "epoch": 0.5185728874138371, + "grad_norm": 0.6356890201568604, + "learning_rate": 1.2112988180883865e-05, + "loss": 0.5514, + "step": 16250 + }, + { + "epoch": 0.5193706918560123, + "grad_norm": 0.5850731134414673, + "learning_rate": 1.2092914953751286e-05, + "loss": 0.4528, + "step": 16275 + }, + { + "epoch": 0.5201684962981874, + "grad_norm": 1.5722885131835938, + "learning_rate": 1.2072841726618706e-05, + "loss": 0.419, + "step": 16300 + }, + { + "epoch": 0.5209663007403625, + "grad_norm": 0.645149290561676, + "learning_rate": 1.2052768499486127e-05, + "loss": 0.4778, + "step": 16325 + }, + { + "epoch": 0.5217641051825377, + "grad_norm": 0.8022610545158386, + "learning_rate": 1.2032695272353546e-05, + "loss": 0.4892, + "step": 16350 + }, + { + "epoch": 0.5225619096247128, + "grad_norm": 
1.2941786050796509, + "learning_rate": 1.2012622045220967e-05, + "loss": 0.4753, + "step": 16375 + }, + { + "epoch": 0.523359714066888, + "grad_norm": 0.8070077896118164, + "learning_rate": 1.1992548818088387e-05, + "loss": 0.4174, + "step": 16400 + }, + { + "epoch": 0.5241575185090631, + "grad_norm": 0.7832496166229248, + "learning_rate": 1.1972475590955808e-05, + "loss": 0.461, + "step": 16425 + }, + { + "epoch": 0.5249553229512381, + "grad_norm": 0.6133424639701843, + "learning_rate": 1.1952402363823227e-05, + "loss": 0.4594, + "step": 16450 + }, + { + "epoch": 0.5257531273934133, + "grad_norm": 0.5747678279876709, + "learning_rate": 1.1932329136690649e-05, + "loss": 0.4576, + "step": 16475 + }, + { + "epoch": 0.5265509318355884, + "grad_norm": 0.6148069500923157, + "learning_rate": 1.191225590955807e-05, + "loss": 0.4757, + "step": 16500 + }, + { + "epoch": 0.5273487362777636, + "grad_norm": 0.8133326172828674, + "learning_rate": 1.1892182682425489e-05, + "loss": 0.4392, + "step": 16525 + }, + { + "epoch": 0.5281465407199387, + "grad_norm": 1.6861692667007446, + "learning_rate": 1.187210945529291e-05, + "loss": 0.4725, + "step": 16550 + }, + { + "epoch": 0.5289443451621139, + "grad_norm": 0.44647324085235596, + "learning_rate": 1.185203622816033e-05, + "loss": 0.4729, + "step": 16575 + }, + { + "epoch": 0.529742149604289, + "grad_norm": 0.7046321034431458, + "learning_rate": 1.183196300102775e-05, + "loss": 0.4553, + "step": 16600 + }, + { + "epoch": 0.5305399540464641, + "grad_norm": 0.5717901587486267, + "learning_rate": 1.181188977389517e-05, + "loss": 0.4812, + "step": 16625 + }, + { + "epoch": 0.5313377584886393, + "grad_norm": 0.5891947150230408, + "learning_rate": 1.1791816546762591e-05, + "loss": 0.3988, + "step": 16650 + }, + { + "epoch": 0.5321355629308144, + "grad_norm": 0.5775773525238037, + "learning_rate": 1.177174331963001e-05, + "loss": 0.4744, + "step": 16675 + }, + { + "epoch": 0.5329333673729896, + "grad_norm": 0.7174286246299744, + "learning_rate": 1.1751670092497432e-05, + "loss": 0.3991, + "step": 16700 + }, + { + "epoch": 0.5337311718151647, + "grad_norm": 0.754780650138855, + "learning_rate": 1.1731596865364851e-05, + "loss": 0.4513, + "step": 16725 + }, + { + "epoch": 0.5345289762573397, + "grad_norm": 0.8494898080825806, + "learning_rate": 1.1711523638232273e-05, + "loss": 0.4149, + "step": 16750 + }, + { + "epoch": 0.5353267806995149, + "grad_norm": 0.32787203788757324, + "learning_rate": 1.1691450411099692e-05, + "loss": 0.4747, + "step": 16775 + }, + { + "epoch": 0.53612458514169, + "grad_norm": 1.0056346654891968, + "learning_rate": 1.1671377183967113e-05, + "loss": 0.4527, + "step": 16800 + }, + { + "epoch": 0.5369223895838652, + "grad_norm": 1.3246283531188965, + "learning_rate": 1.1651303956834533e-05, + "loss": 0.4984, + "step": 16825 + }, + { + "epoch": 0.5377201940260403, + "grad_norm": 1.2716175317764282, + "learning_rate": 1.1631230729701954e-05, + "loss": 0.3923, + "step": 16850 + }, + { + "epoch": 0.5385179984682155, + "grad_norm": 0.9225130677223206, + "learning_rate": 1.1611157502569373e-05, + "loss": 0.4539, + "step": 16875 + }, + { + "epoch": 0.5393158029103906, + "grad_norm": 0.7265071272850037, + "learning_rate": 1.1591084275436794e-05, + "loss": 0.4778, + "step": 16900 + }, + { + "epoch": 0.5401136073525657, + "grad_norm": 0.8638312816619873, + "learning_rate": 1.1571011048304214e-05, + "loss": 0.4632, + "step": 16925 + }, + { + "epoch": 0.5409114117947409, + "grad_norm": 0.6868686079978943, + "learning_rate": 1.1550937821171635e-05, + 
"loss": 0.5054, + "step": 16950 + }, + { + "epoch": 0.541709216236916, + "grad_norm": 1.0026332139968872, + "learning_rate": 1.1530864594039054e-05, + "loss": 0.4213, + "step": 16975 + }, + { + "epoch": 0.5425070206790912, + "grad_norm": 1.02359938621521, + "learning_rate": 1.1510791366906475e-05, + "loss": 0.4539, + "step": 17000 + }, + { + "epoch": 0.5433048251212663, + "grad_norm": 0.5522026419639587, + "learning_rate": 1.1490718139773895e-05, + "loss": 0.4533, + "step": 17025 + }, + { + "epoch": 0.5441026295634415, + "grad_norm": 0.847406268119812, + "learning_rate": 1.1470644912641316e-05, + "loss": 0.4679, + "step": 17050 + }, + { + "epoch": 0.5449004340056165, + "grad_norm": 0.3250424861907959, + "learning_rate": 1.1450571685508735e-05, + "loss": 0.4526, + "step": 17075 + }, + { + "epoch": 0.5456982384477916, + "grad_norm": 1.1642229557037354, + "learning_rate": 1.1430498458376157e-05, + "loss": 0.4731, + "step": 17100 + }, + { + "epoch": 0.5464960428899668, + "grad_norm": 1.393276572227478, + "learning_rate": 1.1410425231243576e-05, + "loss": 0.4393, + "step": 17125 + }, + { + "epoch": 0.5472938473321419, + "grad_norm": 0.9260122179985046, + "learning_rate": 1.1390352004110997e-05, + "loss": 0.415, + "step": 17150 + }, + { + "epoch": 0.5480916517743171, + "grad_norm": 0.6754615902900696, + "learning_rate": 1.1370278776978417e-05, + "loss": 0.4119, + "step": 17175 + }, + { + "epoch": 0.5488894562164922, + "grad_norm": 0.4611823856830597, + "learning_rate": 1.1350205549845838e-05, + "loss": 0.4634, + "step": 17200 + }, + { + "epoch": 0.5496872606586674, + "grad_norm": 0.439230740070343, + "learning_rate": 1.1330132322713259e-05, + "loss": 0.3912, + "step": 17225 + }, + { + "epoch": 0.5504850651008425, + "grad_norm": 0.7567656636238098, + "learning_rate": 1.1310059095580678e-05, + "loss": 0.4156, + "step": 17250 + }, + { + "epoch": 0.5512828695430176, + "grad_norm": 0.6174827814102173, + "learning_rate": 1.12899858684481e-05, + "loss": 0.4712, + "step": 17275 + }, + { + "epoch": 0.5520806739851928, + "grad_norm": 1.0842621326446533, + "learning_rate": 1.1269912641315519e-05, + "loss": 0.4809, + "step": 17300 + }, + { + "epoch": 0.5528784784273679, + "grad_norm": 1.0989375114440918, + "learning_rate": 1.124983941418294e-05, + "loss": 0.4579, + "step": 17325 + }, + { + "epoch": 0.5536762828695431, + "grad_norm": 1.2708433866500854, + "learning_rate": 1.122976618705036e-05, + "loss": 0.4065, + "step": 17350 + }, + { + "epoch": 0.5544740873117181, + "grad_norm": 0.6591207981109619, + "learning_rate": 1.120969295991778e-05, + "loss": 0.5051, + "step": 17375 + }, + { + "epoch": 0.5552718917538932, + "grad_norm": 0.7222548127174377, + "learning_rate": 1.11896197327852e-05, + "loss": 0.5097, + "step": 17400 + }, + { + "epoch": 0.5560696961960684, + "grad_norm": 1.1408518552780151, + "learning_rate": 1.1169546505652621e-05, + "loss": 0.4664, + "step": 17425 + }, + { + "epoch": 0.5568675006382435, + "grad_norm": 0.2580488622188568, + "learning_rate": 1.1149473278520042e-05, + "loss": 0.4534, + "step": 17450 + }, + { + "epoch": 0.5576653050804187, + "grad_norm": 0.9623468518257141, + "learning_rate": 1.1129400051387462e-05, + "loss": 0.4357, + "step": 17475 + }, + { + "epoch": 0.5584631095225938, + "grad_norm": 1.5668182373046875, + "learning_rate": 1.1109326824254883e-05, + "loss": 0.5258, + "step": 17500 + }, + { + "epoch": 0.559260913964769, + "grad_norm": 0.4907796382904053, + "learning_rate": 1.1089253597122302e-05, + "loss": 0.4705, + "step": 17525 + }, + { + "epoch": 0.5600587184069441, + 
"grad_norm": 0.6818260550498962, + "learning_rate": 1.1069180369989723e-05, + "loss": 0.5015, + "step": 17550 + }, + { + "epoch": 0.5608565228491192, + "grad_norm": 1.1441423892974854, + "learning_rate": 1.1049107142857143e-05, + "loss": 0.4585, + "step": 17575 + }, + { + "epoch": 0.5616543272912944, + "grad_norm": 0.5130887627601624, + "learning_rate": 1.1029033915724564e-05, + "loss": 0.4527, + "step": 17600 + }, + { + "epoch": 0.5624521317334695, + "grad_norm": 1.3574963808059692, + "learning_rate": 1.1008960688591983e-05, + "loss": 0.4047, + "step": 17625 + }, + { + "epoch": 0.5632499361756447, + "grad_norm": 0.6711642146110535, + "learning_rate": 1.0988887461459405e-05, + "loss": 0.4798, + "step": 17650 + }, + { + "epoch": 0.5640477406178198, + "grad_norm": 0.6693195104598999, + "learning_rate": 1.0968814234326824e-05, + "loss": 0.4238, + "step": 17675 + }, + { + "epoch": 0.5648455450599948, + "grad_norm": 1.0625276565551758, + "learning_rate": 1.0948741007194245e-05, + "loss": 0.3812, + "step": 17700 + }, + { + "epoch": 0.56564334950217, + "grad_norm": 0.9710107445716858, + "learning_rate": 1.0928667780061666e-05, + "loss": 0.4445, + "step": 17725 + }, + { + "epoch": 0.5664411539443451, + "grad_norm": 0.9373927116394043, + "learning_rate": 1.0908594552929086e-05, + "loss": 0.4968, + "step": 17750 + }, + { + "epoch": 0.5672389583865203, + "grad_norm": 0.7126112580299377, + "learning_rate": 1.0888521325796507e-05, + "loss": 0.4338, + "step": 17775 + }, + { + "epoch": 0.5680367628286954, + "grad_norm": 0.8321107029914856, + "learning_rate": 1.0868448098663926e-05, + "loss": 0.4766, + "step": 17800 + }, + { + "epoch": 0.5688345672708706, + "grad_norm": 1.0770740509033203, + "learning_rate": 1.0848374871531347e-05, + "loss": 0.444, + "step": 17825 + }, + { + "epoch": 0.5696323717130457, + "grad_norm": 0.8870105147361755, + "learning_rate": 1.0828301644398767e-05, + "loss": 0.4883, + "step": 17850 + }, + { + "epoch": 0.5704301761552208, + "grad_norm": 1.0617618560791016, + "learning_rate": 1.0808228417266188e-05, + "loss": 0.4949, + "step": 17875 + }, + { + "epoch": 0.571227980597396, + "grad_norm": 0.30951830744743347, + "learning_rate": 1.0788155190133607e-05, + "loss": 0.5067, + "step": 17900 + }, + { + "epoch": 0.5720257850395711, + "grad_norm": 0.9690567255020142, + "learning_rate": 1.0768081963001028e-05, + "loss": 0.4336, + "step": 17925 + }, + { + "epoch": 0.5728235894817463, + "grad_norm": 1.1942867040634155, + "learning_rate": 1.074800873586845e-05, + "loss": 0.4603, + "step": 17950 + }, + { + "epoch": 0.5736213939239214, + "grad_norm": 0.9600849151611328, + "learning_rate": 1.0727935508735869e-05, + "loss": 0.4351, + "step": 17975 + }, + { + "epoch": 0.5744191983660965, + "grad_norm": 1.325777530670166, + "learning_rate": 1.070786228160329e-05, + "loss": 0.4366, + "step": 18000 + }, + { + "epoch": 0.5752170028082716, + "grad_norm": 0.8732290267944336, + "learning_rate": 1.068778905447071e-05, + "loss": 0.4119, + "step": 18025 + }, + { + "epoch": 0.5760148072504467, + "grad_norm": 0.822232186794281, + "learning_rate": 1.066771582733813e-05, + "loss": 0.453, + "step": 18050 + }, + { + "epoch": 0.5768126116926219, + "grad_norm": 0.5754446387290955, + "learning_rate": 1.064764260020555e-05, + "loss": 0.4386, + "step": 18075 + }, + { + "epoch": 0.577610416134797, + "grad_norm": 0.9932488203048706, + "learning_rate": 1.0627569373072971e-05, + "loss": 0.4224, + "step": 18100 + }, + { + "epoch": 0.5784082205769722, + "grad_norm": 1.197025179862976, + "learning_rate": 
1.060749614594039e-05, + "loss": 0.4029, + "step": 18125 + }, + { + "epoch": 0.5792060250191473, + "grad_norm": 0.6170285940170288, + "learning_rate": 1.0587422918807812e-05, + "loss": 0.4688, + "step": 18150 + }, + { + "epoch": 0.5800038294613225, + "grad_norm": 0.6788281798362732, + "learning_rate": 1.0567349691675233e-05, + "loss": 0.4127, + "step": 18175 + }, + { + "epoch": 0.5808016339034976, + "grad_norm": 0.9156305193901062, + "learning_rate": 1.0547276464542652e-05, + "loss": 0.4977, + "step": 18200 + }, + { + "epoch": 0.5815994383456727, + "grad_norm": 1.309981107711792, + "learning_rate": 1.0527203237410074e-05, + "loss": 0.3932, + "step": 18225 + }, + { + "epoch": 0.5823972427878479, + "grad_norm": 1.0941970348358154, + "learning_rate": 1.0507130010277493e-05, + "loss": 0.4313, + "step": 18250 + }, + { + "epoch": 0.583195047230023, + "grad_norm": 0.6311997771263123, + "learning_rate": 1.0487056783144914e-05, + "loss": 0.4303, + "step": 18275 + }, + { + "epoch": 0.5839928516721982, + "grad_norm": 0.6561720967292786, + "learning_rate": 1.0466983556012334e-05, + "loss": 0.5383, + "step": 18300 + }, + { + "epoch": 0.5847906561143732, + "grad_norm": 0.528823733329773, + "learning_rate": 1.0446910328879755e-05, + "loss": 0.467, + "step": 18325 + }, + { + "epoch": 0.5855884605565483, + "grad_norm": 0.3194543123245239, + "learning_rate": 1.0426837101747174e-05, + "loss": 0.484, + "step": 18350 + }, + { + "epoch": 0.5863862649987235, + "grad_norm": 1.203895092010498, + "learning_rate": 1.0406763874614595e-05, + "loss": 0.5813, + "step": 18375 + }, + { + "epoch": 0.5871840694408986, + "grad_norm": 0.5720348954200745, + "learning_rate": 1.0386690647482015e-05, + "loss": 0.4371, + "step": 18400 + }, + { + "epoch": 0.5879818738830738, + "grad_norm": 0.49713271856307983, + "learning_rate": 1.0366617420349436e-05, + "loss": 0.51, + "step": 18425 + }, + { + "epoch": 0.5887796783252489, + "grad_norm": 0.9100812673568726, + "learning_rate": 1.0346544193216855e-05, + "loss": 0.456, + "step": 18450 + }, + { + "epoch": 0.589577482767424, + "grad_norm": 0.6779456734657288, + "learning_rate": 1.0326470966084276e-05, + "loss": 0.4508, + "step": 18475 + }, + { + "epoch": 0.5903752872095992, + "grad_norm": 0.5142289996147156, + "learning_rate": 1.0306397738951696e-05, + "loss": 0.5069, + "step": 18500 + }, + { + "epoch": 0.5911730916517743, + "grad_norm": 0.8972587585449219, + "learning_rate": 1.0286324511819117e-05, + "loss": 0.5365, + "step": 18525 + }, + { + "epoch": 0.5919708960939495, + "grad_norm": 0.6554467678070068, + "learning_rate": 1.0266251284686536e-05, + "loss": 0.4959, + "step": 18550 + }, + { + "epoch": 0.5927687005361246, + "grad_norm": 0.8800398111343384, + "learning_rate": 1.0246178057553958e-05, + "loss": 0.4706, + "step": 18575 + }, + { + "epoch": 0.5935665049782998, + "grad_norm": 0.9559227824211121, + "learning_rate": 1.0226104830421377e-05, + "loss": 0.4496, + "step": 18600 + }, + { + "epoch": 0.5943643094204748, + "grad_norm": 1.1470880508422852, + "learning_rate": 1.0206031603288798e-05, + "loss": 0.4553, + "step": 18625 + }, + { + "epoch": 0.5951621138626499, + "grad_norm": 0.7805154919624329, + "learning_rate": 1.0185958376156218e-05, + "loss": 0.5258, + "step": 18650 + }, + { + "epoch": 0.5959599183048251, + "grad_norm": 1.723807454109192, + "learning_rate": 1.0165885149023639e-05, + "loss": 0.5571, + "step": 18675 + }, + { + "epoch": 0.5967577227470002, + "grad_norm": 0.3630259931087494, + "learning_rate": 1.0145811921891058e-05, + "loss": 0.427, + "step": 18700 + }, + { + 
"epoch": 0.5975555271891754, + "grad_norm": 1.5315433740615845, + "learning_rate": 1.012573869475848e-05, + "loss": 0.4529, + "step": 18725 + }, + { + "epoch": 0.5983533316313505, + "grad_norm": 0.9309017658233643, + "learning_rate": 1.0105665467625899e-05, + "loss": 0.4576, + "step": 18750 + }, + { + "epoch": 0.5991511360735257, + "grad_norm": 1.712268352508545, + "learning_rate": 1.008559224049332e-05, + "loss": 0.4551, + "step": 18775 + }, + { + "epoch": 0.5999489405157008, + "grad_norm": 1.235117793083191, + "learning_rate": 1.006551901336074e-05, + "loss": 0.4807, + "step": 18800 + }, + { + "epoch": 0.6007467449578759, + "grad_norm": 1.1038379669189453, + "learning_rate": 1.004544578622816e-05, + "loss": 0.4631, + "step": 18825 + }, + { + "epoch": 0.6015445494000511, + "grad_norm": 0.6753078699111938, + "learning_rate": 1.002537255909558e-05, + "loss": 0.5024, + "step": 18850 + }, + { + "epoch": 0.6023423538422262, + "grad_norm": 0.7761247754096985, + "learning_rate": 1.0005299331963001e-05, + "loss": 0.4557, + "step": 18875 + }, + { + "epoch": 0.6031401582844014, + "grad_norm": 1.1328458786010742, + "learning_rate": 9.985226104830422e-06, + "loss": 0.5145, + "step": 18900 + }, + { + "epoch": 0.6039379627265765, + "grad_norm": 0.63515704870224, + "learning_rate": 9.965152877697842e-06, + "loss": 0.4617, + "step": 18925 + }, + { + "epoch": 0.6047357671687515, + "grad_norm": 0.4074954688549042, + "learning_rate": 9.945079650565263e-06, + "loss": 0.4264, + "step": 18950 + }, + { + "epoch": 0.6055335716109267, + "grad_norm": 0.9943312406539917, + "learning_rate": 9.925006423432682e-06, + "loss": 0.5184, + "step": 18975 + }, + { + "epoch": 0.6063313760531018, + "grad_norm": 0.9145040512084961, + "learning_rate": 9.904933196300103e-06, + "loss": 0.3868, + "step": 19000 + }, + { + "epoch": 0.607129180495277, + "grad_norm": 1.028710126876831, + "learning_rate": 9.884859969167523e-06, + "loss": 0.4998, + "step": 19025 + }, + { + "epoch": 0.6079269849374521, + "grad_norm": 0.7836230397224426, + "learning_rate": 9.864786742034944e-06, + "loss": 0.467, + "step": 19050 + }, + { + "epoch": 0.6087247893796273, + "grad_norm": 1.2801554203033447, + "learning_rate": 9.844713514902363e-06, + "loss": 0.4495, + "step": 19075 + }, + { + "epoch": 0.6095225938218024, + "grad_norm": 0.748471200466156, + "learning_rate": 9.824640287769784e-06, + "loss": 0.4979, + "step": 19100 + }, + { + "epoch": 0.6103203982639775, + "grad_norm": 0.5333442091941833, + "learning_rate": 9.804567060637206e-06, + "loss": 0.5054, + "step": 19125 + }, + { + "epoch": 0.6111182027061527, + "grad_norm": 0.6367368698120117, + "learning_rate": 9.784493833504625e-06, + "loss": 0.4753, + "step": 19150 + }, + { + "epoch": 0.6119160071483278, + "grad_norm": 0.6109923720359802, + "learning_rate": 9.764420606372046e-06, + "loss": 0.4178, + "step": 19175 + }, + { + "epoch": 0.612713811590503, + "grad_norm": 0.8069015145301819, + "learning_rate": 9.744347379239466e-06, + "loss": 0.5275, + "step": 19200 + }, + { + "epoch": 0.6135116160326781, + "grad_norm": 0.6866354942321777, + "learning_rate": 9.724274152106887e-06, + "loss": 0.3898, + "step": 19225 + }, + { + "epoch": 0.6143094204748532, + "grad_norm": 0.7845311164855957, + "learning_rate": 9.704200924974306e-06, + "loss": 0.5202, + "step": 19250 + }, + { + "epoch": 0.6151072249170283, + "grad_norm": 0.6064120531082153, + "learning_rate": 9.684127697841727e-06, + "loss": 0.4977, + "step": 19275 + }, + { + "epoch": 0.6159050293592034, + "grad_norm": 0.8461109399795532, + "learning_rate": 
9.664054470709147e-06, + "loss": 0.4322, + "step": 19300 + }, + { + "epoch": 0.6167028338013786, + "grad_norm": 0.3266942501068115, + "learning_rate": 9.643981243576568e-06, + "loss": 0.473, + "step": 19325 + }, + { + "epoch": 0.6175006382435537, + "grad_norm": 0.7122315764427185, + "learning_rate": 9.623908016443989e-06, + "loss": 0.518, + "step": 19350 + }, + { + "epoch": 0.6182984426857289, + "grad_norm": 0.25452858209609985, + "learning_rate": 9.603834789311408e-06, + "loss": 0.4029, + "step": 19375 + }, + { + "epoch": 0.619096247127904, + "grad_norm": 0.5196482539176941, + "learning_rate": 9.58376156217883e-06, + "loss": 0.3705, + "step": 19400 + }, + { + "epoch": 0.6198940515700792, + "grad_norm": 1.0940996408462524, + "learning_rate": 9.563688335046249e-06, + "loss": 0.4584, + "step": 19425 + }, + { + "epoch": 0.6206918560122543, + "grad_norm": 0.5919874310493469, + "learning_rate": 9.54361510791367e-06, + "loss": 0.5453, + "step": 19450 + }, + { + "epoch": 0.6214896604544294, + "grad_norm": 1.0671135187149048, + "learning_rate": 9.52354188078109e-06, + "loss": 0.4539, + "step": 19475 + }, + { + "epoch": 0.6222874648966046, + "grad_norm": 1.9059410095214844, + "learning_rate": 9.50346865364851e-06, + "loss": 0.4905, + "step": 19500 + }, + { + "epoch": 0.6230852693387797, + "grad_norm": 0.8389720320701599, + "learning_rate": 9.48339542651593e-06, + "loss": 0.4132, + "step": 19525 + }, + { + "epoch": 0.6238830737809549, + "grad_norm": 0.7651321291923523, + "learning_rate": 9.463322199383351e-06, + "loss": 0.4647, + "step": 19550 + }, + { + "epoch": 0.6246808782231299, + "grad_norm": 0.4866482615470886, + "learning_rate": 9.443248972250772e-06, + "loss": 0.4135, + "step": 19575 + }, + { + "epoch": 0.625478682665305, + "grad_norm": 0.42190396785736084, + "learning_rate": 9.423175745118192e-06, + "loss": 0.4283, + "step": 19600 + }, + { + "epoch": 0.6262764871074802, + "grad_norm": 2.0557570457458496, + "learning_rate": 9.403102517985613e-06, + "loss": 0.4617, + "step": 19625 + }, + { + "epoch": 0.6270742915496553, + "grad_norm": 0.7080968022346497, + "learning_rate": 9.383029290853032e-06, + "loss": 0.4704, + "step": 19650 + }, + { + "epoch": 0.6278720959918305, + "grad_norm": 0.35508260130882263, + "learning_rate": 9.362956063720454e-06, + "loss": 0.514, + "step": 19675 + }, + { + "epoch": 0.6286699004340056, + "grad_norm": 0.3968958556652069, + "learning_rate": 9.342882836587873e-06, + "loss": 0.4835, + "step": 19700 + }, + { + "epoch": 0.6294677048761808, + "grad_norm": 0.7077760696411133, + "learning_rate": 9.322809609455294e-06, + "loss": 0.332, + "step": 19725 + }, + { + "epoch": 0.6302655093183559, + "grad_norm": 0.5143846869468689, + "learning_rate": 9.302736382322714e-06, + "loss": 0.5054, + "step": 19750 + }, + { + "epoch": 0.631063313760531, + "grad_norm": 0.4242660105228424, + "learning_rate": 9.282663155190135e-06, + "loss": 0.4281, + "step": 19775 + }, + { + "epoch": 0.6318611182027062, + "grad_norm": 0.679987907409668, + "learning_rate": 9.262589928057554e-06, + "loss": 0.4453, + "step": 19800 + }, + { + "epoch": 0.6326589226448813, + "grad_norm": 1.0942387580871582, + "learning_rate": 9.242516700924975e-06, + "loss": 0.4916, + "step": 19825 + }, + { + "epoch": 0.6334567270870565, + "grad_norm": 0.6843435168266296, + "learning_rate": 9.222443473792396e-06, + "loss": 0.4365, + "step": 19850 + }, + { + "epoch": 0.6342545315292315, + "grad_norm": 0.5189023613929749, + "learning_rate": 9.202370246659816e-06, + "loss": 0.4858, + "step": 19875 + }, + { + "epoch": 
0.6350523359714066, + "grad_norm": 0.9441674947738647, + "learning_rate": 9.182297019527237e-06, + "loss": 0.3874, + "step": 19900 + }, + { + "epoch": 0.6358501404135818, + "grad_norm": 0.35448402166366577, + "learning_rate": 9.162223792394656e-06, + "loss": 0.3836, + "step": 19925 + }, + { + "epoch": 0.6366479448557569, + "grad_norm": 1.9092568159103394, + "learning_rate": 9.142150565262078e-06, + "loss": 0.4742, + "step": 19950 + }, + { + "epoch": 0.6374457492979321, + "grad_norm": 0.3969165086746216, + "learning_rate": 9.122077338129497e-06, + "loss": 0.3687, + "step": 19975 + }, + { + "epoch": 0.6382435537401072, + "grad_norm": 1.0867652893066406, + "learning_rate": 9.102004110996918e-06, + "loss": 0.4653, + "step": 20000 + }, + { + "epoch": 0.6390413581822824, + "grad_norm": 1.0081027746200562, + "learning_rate": 9.081930883864338e-06, + "loss": 0.4174, + "step": 20025 + }, + { + "epoch": 0.6398391626244575, + "grad_norm": 0.42013636231422424, + "learning_rate": 9.061857656731759e-06, + "loss": 0.4373, + "step": 20050 + }, + { + "epoch": 0.6406369670666326, + "grad_norm": 0.9158220291137695, + "learning_rate": 9.041784429599178e-06, + "loss": 0.4604, + "step": 20075 + }, + { + "epoch": 0.6414347715088078, + "grad_norm": 1.103516697883606, + "learning_rate": 9.021711202466598e-06, + "loss": 0.4662, + "step": 20100 + }, + { + "epoch": 0.6422325759509829, + "grad_norm": 1.128838300704956, + "learning_rate": 9.001637975334019e-06, + "loss": 0.4665, + "step": 20125 + }, + { + "epoch": 0.6430303803931581, + "grad_norm": 0.66585773229599, + "learning_rate": 8.981564748201438e-06, + "loss": 0.4627, + "step": 20150 + }, + { + "epoch": 0.6438281848353332, + "grad_norm": 1.4553017616271973, + "learning_rate": 8.96149152106886e-06, + "loss": 0.4319, + "step": 20175 + }, + { + "epoch": 0.6446259892775082, + "grad_norm": 0.5869995355606079, + "learning_rate": 8.941418293936279e-06, + "loss": 0.4512, + "step": 20200 + }, + { + "epoch": 0.6454237937196834, + "grad_norm": 0.7448079586029053, + "learning_rate": 8.9213450668037e-06, + "loss": 0.4283, + "step": 20225 + }, + { + "epoch": 0.6462215981618585, + "grad_norm": 0.9250773191452026, + "learning_rate": 8.90127183967112e-06, + "loss": 0.4558, + "step": 20250 + }, + { + "epoch": 0.6470194026040337, + "grad_norm": 1.2568118572235107, + "learning_rate": 8.88119861253854e-06, + "loss": 0.4673, + "step": 20275 + }, + { + "epoch": 0.6478172070462088, + "grad_norm": 0.7910193800926208, + "learning_rate": 8.861125385405962e-06, + "loss": 0.4134, + "step": 20300 + }, + { + "epoch": 0.648615011488384, + "grad_norm": 0.8021867871284485, + "learning_rate": 8.841052158273381e-06, + "loss": 0.4737, + "step": 20325 + }, + { + "epoch": 0.6494128159305591, + "grad_norm": 1.035273790359497, + "learning_rate": 8.820978931140802e-06, + "loss": 0.451, + "step": 20350 + }, + { + "epoch": 0.6502106203727342, + "grad_norm": 0.932902455329895, + "learning_rate": 8.800905704008222e-06, + "loss": 0.4868, + "step": 20375 + }, + { + "epoch": 0.6510084248149094, + "grad_norm": 0.8025002479553223, + "learning_rate": 8.780832476875643e-06, + "loss": 0.4211, + "step": 20400 + }, + { + "epoch": 0.6518062292570845, + "grad_norm": 0.7193844318389893, + "learning_rate": 8.760759249743062e-06, + "loss": 0.4245, + "step": 20425 + }, + { + "epoch": 0.6526040336992597, + "grad_norm": 0.7800945043563843, + "learning_rate": 8.740686022610483e-06, + "loss": 0.4597, + "step": 20450 + }, + { + "epoch": 0.6534018381414348, + "grad_norm": 1.033271074295044, + "learning_rate": 
8.720612795477903e-06, + "loss": 0.4414, + "step": 20475 + }, + { + "epoch": 0.6541996425836099, + "grad_norm": 2.8177645206451416, + "learning_rate": 8.700539568345324e-06, + "loss": 0.4914, + "step": 20500 + }, + { + "epoch": 0.654997447025785, + "grad_norm": 0.8891062140464783, + "learning_rate": 8.680466341212745e-06, + "loss": 0.4424, + "step": 20525 + }, + { + "epoch": 0.6557952514679601, + "grad_norm": 1.363903522491455, + "learning_rate": 8.660393114080164e-06, + "loss": 0.4735, + "step": 20550 + }, + { + "epoch": 0.6565930559101353, + "grad_norm": 0.9103833436965942, + "learning_rate": 8.640319886947586e-06, + "loss": 0.4518, + "step": 20575 + }, + { + "epoch": 0.6573908603523104, + "grad_norm": 0.7822914719581604, + "learning_rate": 8.620246659815005e-06, + "loss": 0.4292, + "step": 20600 + }, + { + "epoch": 0.6581886647944856, + "grad_norm": 2.1470134258270264, + "learning_rate": 8.600173432682426e-06, + "loss": 0.4664, + "step": 20625 + }, + { + "epoch": 0.6589864692366607, + "grad_norm": 0.6423518657684326, + "learning_rate": 8.580100205549846e-06, + "loss": 0.4308, + "step": 20650 + }, + { + "epoch": 0.6597842736788359, + "grad_norm": 1.1730583906173706, + "learning_rate": 8.560026978417267e-06, + "loss": 0.4317, + "step": 20675 + }, + { + "epoch": 0.660582078121011, + "grad_norm": 1.6694974899291992, + "learning_rate": 8.539953751284686e-06, + "loss": 0.4499, + "step": 20700 + }, + { + "epoch": 0.6613798825631861, + "grad_norm": 1.1325433254241943, + "learning_rate": 8.519880524152107e-06, + "loss": 0.505, + "step": 20725 + }, + { + "epoch": 0.6621776870053613, + "grad_norm": 1.1263487339019775, + "learning_rate": 8.499807297019528e-06, + "loss": 0.4222, + "step": 20750 + }, + { + "epoch": 0.6629754914475364, + "grad_norm": 0.5488644242286682, + "learning_rate": 8.479734069886948e-06, + "loss": 0.4639, + "step": 20775 + }, + { + "epoch": 0.6637732958897116, + "grad_norm": 0.7831270694732666, + "learning_rate": 8.459660842754369e-06, + "loss": 0.4212, + "step": 20800 + }, + { + "epoch": 0.6645711003318866, + "grad_norm": 0.8199292421340942, + "learning_rate": 8.439587615621788e-06, + "loss": 0.4419, + "step": 20825 + }, + { + "epoch": 0.6653689047740617, + "grad_norm": 0.6708464026451111, + "learning_rate": 8.41951438848921e-06, + "loss": 0.4738, + "step": 20850 + }, + { + "epoch": 0.6661667092162369, + "grad_norm": 0.7316299676895142, + "learning_rate": 8.399441161356629e-06, + "loss": 0.4844, + "step": 20875 + }, + { + "epoch": 0.666964513658412, + "grad_norm": 1.4401187896728516, + "learning_rate": 8.37936793422405e-06, + "loss": 0.4703, + "step": 20900 + }, + { + "epoch": 0.6677623181005872, + "grad_norm": 0.5707149505615234, + "learning_rate": 8.35929470709147e-06, + "loss": 0.4759, + "step": 20925 + }, + { + "epoch": 0.6685601225427623, + "grad_norm": 0.7490055561065674, + "learning_rate": 8.33922147995889e-06, + "loss": 0.3137, + "step": 20950 + }, + { + "epoch": 0.6693579269849375, + "grad_norm": 0.6585310101509094, + "learning_rate": 8.31914825282631e-06, + "loss": 0.3908, + "step": 20975 + }, + { + "epoch": 0.6701557314271126, + "grad_norm": 0.8592582941055298, + "learning_rate": 8.299075025693731e-06, + "loss": 0.5232, + "step": 21000 + }, + { + "epoch": 0.6709535358692877, + "grad_norm": 0.35057422518730164, + "learning_rate": 8.279001798561152e-06, + "loss": 0.3998, + "step": 21025 + }, + { + "epoch": 0.6717513403114629, + "grad_norm": 0.4567711651325226, + "learning_rate": 8.258928571428572e-06, + "loss": 0.4052, + "step": 21050 + }, + { + "epoch": 
0.672549144753638, + "grad_norm": 0.9955174922943115, + "learning_rate": 8.238855344295993e-06, + "loss": 0.4841, + "step": 21075 + }, + { + "epoch": 0.6733469491958132, + "grad_norm": 0.7141025066375732, + "learning_rate": 8.218782117163412e-06, + "loss": 0.3902, + "step": 21100 + }, + { + "epoch": 0.6741447536379882, + "grad_norm": 0.9155953526496887, + "learning_rate": 8.198708890030833e-06, + "loss": 0.4197, + "step": 21125 + }, + { + "epoch": 0.6749425580801633, + "grad_norm": 1.427937388420105, + "learning_rate": 8.178635662898253e-06, + "loss": 0.4639, + "step": 21150 + }, + { + "epoch": 0.6757403625223385, + "grad_norm": 0.2833626866340637, + "learning_rate": 8.158562435765674e-06, + "loss": 0.4684, + "step": 21175 + }, + { + "epoch": 0.6765381669645136, + "grad_norm": 0.4364416301250458, + "learning_rate": 8.138489208633094e-06, + "loss": 0.3881, + "step": 21200 + }, + { + "epoch": 0.6773359714066888, + "grad_norm": 0.6703497171401978, + "learning_rate": 8.118415981500515e-06, + "loss": 0.4907, + "step": 21225 + }, + { + "epoch": 0.6781337758488639, + "grad_norm": 1.2733856439590454, + "learning_rate": 8.098342754367936e-06, + "loss": 0.4545, + "step": 21250 + }, + { + "epoch": 0.6789315802910391, + "grad_norm": 1.015809416770935, + "learning_rate": 8.078269527235355e-06, + "loss": 0.49, + "step": 21275 + }, + { + "epoch": 0.6797293847332142, + "grad_norm": 0.4571249783039093, + "learning_rate": 8.058196300102776e-06, + "loss": 0.5203, + "step": 21300 + }, + { + "epoch": 0.6805271891753893, + "grad_norm": 0.6945416927337646, + "learning_rate": 8.038123072970196e-06, + "loss": 0.45, + "step": 21325 + }, + { + "epoch": 0.6813249936175645, + "grad_norm": 0.37875500321388245, + "learning_rate": 8.018049845837617e-06, + "loss": 0.4111, + "step": 21350 + }, + { + "epoch": 0.6821227980597396, + "grad_norm": 0.6942138671875, + "learning_rate": 7.997976618705036e-06, + "loss": 0.4605, + "step": 21375 + }, + { + "epoch": 0.6829206025019148, + "grad_norm": 1.9715650081634521, + "learning_rate": 7.977903391572457e-06, + "loss": 0.4446, + "step": 21400 + }, + { + "epoch": 0.6837184069440899, + "grad_norm": 0.6993141174316406, + "learning_rate": 7.957830164439877e-06, + "loss": 0.3837, + "step": 21425 + }, + { + "epoch": 0.684516211386265, + "grad_norm": 1.2793349027633667, + "learning_rate": 7.937756937307298e-06, + "loss": 0.4387, + "step": 21450 + }, + { + "epoch": 0.6853140158284401, + "grad_norm": 1.1474782228469849, + "learning_rate": 7.91768371017472e-06, + "loss": 0.4459, + "step": 21475 + }, + { + "epoch": 0.6861118202706152, + "grad_norm": 1.2606806755065918, + "learning_rate": 7.897610483042139e-06, + "loss": 0.4069, + "step": 21500 + }, + { + "epoch": 0.6869096247127904, + "grad_norm": 0.9630894064903259, + "learning_rate": 7.87753725590956e-06, + "loss": 0.4406, + "step": 21525 + }, + { + "epoch": 0.6877074291549655, + "grad_norm": 1.2839369773864746, + "learning_rate": 7.85746402877698e-06, + "loss": 0.3923, + "step": 21550 + }, + { + "epoch": 0.6885052335971407, + "grad_norm": 0.9209822416305542, + "learning_rate": 7.8373908016444e-06, + "loss": 0.498, + "step": 21575 + }, + { + "epoch": 0.6893030380393158, + "grad_norm": 1.2149327993392944, + "learning_rate": 7.81731757451182e-06, + "loss": 0.4475, + "step": 21600 + }, + { + "epoch": 0.690100842481491, + "grad_norm": 0.6048392057418823, + "learning_rate": 7.79724434737924e-06, + "loss": 0.5274, + "step": 21625 + }, + { + "epoch": 0.6908986469236661, + "grad_norm": 1.0528596639633179, + "learning_rate": 7.77717112024666e-06, + 
"loss": 0.5082, + "step": 21650 + }, + { + "epoch": 0.6916964513658412, + "grad_norm": 0.7477854490280151, + "learning_rate": 7.75709789311408e-06, + "loss": 0.4999, + "step": 21675 + }, + { + "epoch": 0.6924942558080164, + "grad_norm": 0.8400072455406189, + "learning_rate": 7.737024665981501e-06, + "loss": 0.4639, + "step": 21700 + }, + { + "epoch": 0.6932920602501915, + "grad_norm": 0.7565206289291382, + "learning_rate": 7.71695143884892e-06, + "loss": 0.4452, + "step": 21725 + }, + { + "epoch": 0.6940898646923666, + "grad_norm": 0.8357092142105103, + "learning_rate": 7.696878211716341e-06, + "loss": 0.483, + "step": 21750 + }, + { + "epoch": 0.6948876691345417, + "grad_norm": 0.7302252650260925, + "learning_rate": 7.676804984583761e-06, + "loss": 0.5125, + "step": 21775 + }, + { + "epoch": 0.6956854735767168, + "grad_norm": 0.8201086521148682, + "learning_rate": 7.656731757451182e-06, + "loss": 0.415, + "step": 21800 + }, + { + "epoch": 0.696483278018892, + "grad_norm": 0.5483965277671814, + "learning_rate": 7.636658530318601e-06, + "loss": 0.4885, + "step": 21825 + }, + { + "epoch": 0.6972810824610671, + "grad_norm": 0.669844925403595, + "learning_rate": 7.6165853031860235e-06, + "loss": 0.4792, + "step": 21850 + }, + { + "epoch": 0.6980788869032423, + "grad_norm": 1.3633980751037598, + "learning_rate": 7.596512076053443e-06, + "loss": 0.4563, + "step": 21875 + }, + { + "epoch": 0.6988766913454174, + "grad_norm": 0.5376991033554077, + "learning_rate": 7.576438848920864e-06, + "loss": 0.4399, + "step": 21900 + }, + { + "epoch": 0.6996744957875926, + "grad_norm": 0.7209357619285583, + "learning_rate": 7.5563656217882835e-06, + "loss": 0.4284, + "step": 21925 + }, + { + "epoch": 0.7004723002297677, + "grad_norm": 1.0455883741378784, + "learning_rate": 7.536292394655705e-06, + "loss": 0.4254, + "step": 21950 + }, + { + "epoch": 0.7012701046719428, + "grad_norm": 1.3988233804702759, + "learning_rate": 7.516219167523125e-06, + "loss": 0.4694, + "step": 21975 + }, + { + "epoch": 0.702067909114118, + "grad_norm": 0.7280498743057251, + "learning_rate": 7.496145940390545e-06, + "loss": 0.4521, + "step": 22000 + }, + { + "epoch": 0.7028657135562931, + "grad_norm": 0.457893431186676, + "learning_rate": 7.4760727132579655e-06, + "loss": 0.3892, + "step": 22025 + }, + { + "epoch": 0.7036635179984683, + "grad_norm": 0.6572154760360718, + "learning_rate": 7.455999486125386e-06, + "loss": 0.419, + "step": 22050 + }, + { + "epoch": 0.7044613224406433, + "grad_norm": 0.5123388171195984, + "learning_rate": 7.435926258992806e-06, + "loss": 0.4823, + "step": 22075 + }, + { + "epoch": 0.7052591268828184, + "grad_norm": 1.2624458074569702, + "learning_rate": 7.415853031860226e-06, + "loss": 0.4367, + "step": 22100 + }, + { + "epoch": 0.7060569313249936, + "grad_norm": 0.9980996251106262, + "learning_rate": 7.395779804727647e-06, + "loss": 0.4898, + "step": 22125 + }, + { + "epoch": 0.7068547357671687, + "grad_norm": 1.3691335916519165, + "learning_rate": 7.375706577595067e-06, + "loss": 0.4773, + "step": 22150 + }, + { + "epoch": 0.7076525402093439, + "grad_norm": 0.6638593673706055, + "learning_rate": 7.355633350462487e-06, + "loss": 0.4036, + "step": 22175 + }, + { + "epoch": 0.708450344651519, + "grad_norm": 0.773307740688324, + "learning_rate": 7.335560123329908e-06, + "loss": 0.4258, + "step": 22200 + }, + { + "epoch": 0.7092481490936942, + "grad_norm": 0.8988430500030518, + "learning_rate": 7.315486896197328e-06, + "loss": 0.3869, + "step": 22225 + }, + { + "epoch": 0.7100459535358693, + "grad_norm": 
0.46655961871147156, + "learning_rate": 7.295413669064749e-06, + "loss": 0.4413, + "step": 22250 + }, + { + "epoch": 0.7108437579780444, + "grad_norm": 0.7027944326400757, + "learning_rate": 7.275340441932168e-06, + "loss": 0.4794, + "step": 22275 + }, + { + "epoch": 0.7116415624202196, + "grad_norm": 0.5659714937210083, + "learning_rate": 7.2552672147995895e-06, + "loss": 0.5137, + "step": 22300 + }, + { + "epoch": 0.7124393668623947, + "grad_norm": 0.7782139182090759, + "learning_rate": 7.235193987667009e-06, + "loss": 0.4064, + "step": 22325 + }, + { + "epoch": 0.7132371713045699, + "grad_norm": 0.8942545652389526, + "learning_rate": 7.21512076053443e-06, + "loss": 0.5079, + "step": 22350 + }, + { + "epoch": 0.7140349757467449, + "grad_norm": 0.7615993022918701, + "learning_rate": 7.1950475334018495e-06, + "loss": 0.4202, + "step": 22375 + }, + { + "epoch": 0.71483278018892, + "grad_norm": 0.9601730704307556, + "learning_rate": 7.174974306269271e-06, + "loss": 0.4549, + "step": 22400 + }, + { + "epoch": 0.7156305846310952, + "grad_norm": 1.1959699392318726, + "learning_rate": 7.154901079136692e-06, + "loss": 0.4714, + "step": 22425 + }, + { + "epoch": 0.7164283890732703, + "grad_norm": 0.8447968363761902, + "learning_rate": 7.134827852004111e-06, + "loss": 0.4699, + "step": 22450 + }, + { + "epoch": 0.7172261935154455, + "grad_norm": 1.6587258577346802, + "learning_rate": 7.114754624871532e-06, + "loss": 0.3632, + "step": 22475 + }, + { + "epoch": 0.7180239979576206, + "grad_norm": 0.9805980324745178, + "learning_rate": 7.094681397738952e-06, + "loss": 0.3702, + "step": 22500 + }, + { + "epoch": 0.7188218023997958, + "grad_norm": 0.6011649370193481, + "learning_rate": 7.074608170606373e-06, + "loss": 0.4965, + "step": 22525 + }, + { + "epoch": 0.7196196068419709, + "grad_norm": 0.5311054587364197, + "learning_rate": 7.054534943473792e-06, + "loss": 0.4595, + "step": 22550 + }, + { + "epoch": 0.720417411284146, + "grad_norm": 1.2185113430023193, + "learning_rate": 7.0344617163412134e-06, + "loss": 0.4478, + "step": 22575 + }, + { + "epoch": 0.7212152157263212, + "grad_norm": 0.746266782283783, + "learning_rate": 7.014388489208633e-06, + "loss": 0.4443, + "step": 22600 + }, + { + "epoch": 0.7220130201684963, + "grad_norm": 0.499446302652359, + "learning_rate": 6.994315262076054e-06, + "loss": 0.401, + "step": 22625 + }, + { + "epoch": 0.7228108246106715, + "grad_norm": 0.8455522060394287, + "learning_rate": 6.974242034943475e-06, + "loss": 0.4382, + "step": 22650 + }, + { + "epoch": 0.7236086290528466, + "grad_norm": 1.090509295463562, + "learning_rate": 6.954168807810895e-06, + "loss": 0.4279, + "step": 22675 + }, + { + "epoch": 0.7244064334950217, + "grad_norm": 0.9620064496994019, + "learning_rate": 6.934095580678316e-06, + "loss": 0.3785, + "step": 22700 + }, + { + "epoch": 0.7252042379371968, + "grad_norm": 0.455472856760025, + "learning_rate": 6.914022353545735e-06, + "loss": 0.4455, + "step": 22725 + }, + { + "epoch": 0.7260020423793719, + "grad_norm": 1.5191274881362915, + "learning_rate": 6.893949126413156e-06, + "loss": 0.4911, + "step": 22750 + }, + { + "epoch": 0.7267998468215471, + "grad_norm": 0.9269769191741943, + "learning_rate": 6.873875899280576e-06, + "loss": 0.4514, + "step": 22775 + }, + { + "epoch": 0.7275976512637222, + "grad_norm": 0.774607241153717, + "learning_rate": 6.853802672147996e-06, + "loss": 0.4568, + "step": 22800 + }, + { + "epoch": 0.7283954557058974, + "grad_norm": 1.2156281471252441, + "learning_rate": 6.833729445015416e-06, + "loss": 0.4349, + 
"step": 22825 + }, + { + "epoch": 0.7291932601480725, + "grad_norm": 0.797739565372467, + "learning_rate": 6.813656217882837e-06, + "loss": 0.4247, + "step": 22850 + }, + { + "epoch": 0.7299910645902477, + "grad_norm": 0.942914605140686, + "learning_rate": 6.793582990750258e-06, + "loss": 0.4915, + "step": 22875 + }, + { + "epoch": 0.7307888690324228, + "grad_norm": 1.1048330068588257, + "learning_rate": 6.773509763617677e-06, + "loss": 0.49, + "step": 22900 + }, + { + "epoch": 0.7315866734745979, + "grad_norm": 0.7522678375244141, + "learning_rate": 6.753436536485098e-06, + "loss": 0.5611, + "step": 22925 + }, + { + "epoch": 0.7323844779167731, + "grad_norm": 0.7625138163566589, + "learning_rate": 6.733363309352518e-06, + "loss": 0.4254, + "step": 22950 + }, + { + "epoch": 0.7331822823589482, + "grad_norm": 0.7532039284706116, + "learning_rate": 6.713290082219939e-06, + "loss": 0.4184, + "step": 22975 + }, + { + "epoch": 0.7339800868011233, + "grad_norm": 1.1807453632354736, + "learning_rate": 6.693216855087358e-06, + "loss": 0.5173, + "step": 23000 + }, + { + "epoch": 0.7347778912432984, + "grad_norm": 0.6093919277191162, + "learning_rate": 6.6731436279547794e-06, + "loss": 0.415, + "step": 23025 + }, + { + "epoch": 0.7355756956854735, + "grad_norm": 1.1248325109481812, + "learning_rate": 6.653070400822199e-06, + "loss": 0.4539, + "step": 23050 + }, + { + "epoch": 0.7363735001276487, + "grad_norm": 0.8926781415939331, + "learning_rate": 6.63299717368962e-06, + "loss": 0.4423, + "step": 23075 + }, + { + "epoch": 0.7371713045698238, + "grad_norm": 0.7217565178871155, + "learning_rate": 6.6129239465570394e-06, + "loss": 0.4549, + "step": 23100 + }, + { + "epoch": 0.737969109011999, + "grad_norm": 1.1932436227798462, + "learning_rate": 6.5928507194244606e-06, + "loss": 0.4441, + "step": 23125 + }, + { + "epoch": 0.7387669134541741, + "grad_norm": 0.7477916479110718, + "learning_rate": 6.572777492291882e-06, + "loss": 0.4147, + "step": 23150 + }, + { + "epoch": 0.7395647178963493, + "grad_norm": 0.721010684967041, + "learning_rate": 6.552704265159301e-06, + "loss": 0.4351, + "step": 23175 + }, + { + "epoch": 0.7403625223385244, + "grad_norm": 0.8074814677238464, + "learning_rate": 6.532631038026722e-06, + "loss": 0.5054, + "step": 23200 + }, + { + "epoch": 0.7411603267806995, + "grad_norm": 1.1390972137451172, + "learning_rate": 6.512557810894142e-06, + "loss": 0.5007, + "step": 23225 + }, + { + "epoch": 0.7419581312228747, + "grad_norm": 0.6260811686515808, + "learning_rate": 6.492484583761563e-06, + "loss": 0.4868, + "step": 23250 + }, + { + "epoch": 0.7427559356650498, + "grad_norm": 0.871371865272522, + "learning_rate": 6.472411356628982e-06, + "loss": 0.4647, + "step": 23275 + }, + { + "epoch": 0.743553740107225, + "grad_norm": 0.4815622866153717, + "learning_rate": 6.452338129496403e-06, + "loss": 0.4833, + "step": 23300 + }, + { + "epoch": 0.7443515445494, + "grad_norm": 0.49014878273010254, + "learning_rate": 6.432264902363823e-06, + "loss": 0.4405, + "step": 23325 + }, + { + "epoch": 0.7451493489915751, + "grad_norm": 0.7321723103523254, + "learning_rate": 6.412191675231244e-06, + "loss": 0.4431, + "step": 23350 + }, + { + "epoch": 0.7459471534337503, + "grad_norm": 0.6357221007347107, + "learning_rate": 6.392118448098665e-06, + "loss": 0.4405, + "step": 23375 + }, + { + "epoch": 0.7467449578759254, + "grad_norm": 0.8464056253433228, + "learning_rate": 6.3720452209660846e-06, + "loss": 0.3762, + "step": 23400 + }, + { + "epoch": 0.7475427623181006, + "grad_norm": 0.36736053228378296, 
+ "learning_rate": 6.351971993833506e-06, + "loss": 0.5167, + "step": 23425 + }, + { + "epoch": 0.7483405667602757, + "grad_norm": 0.4179846942424774, + "learning_rate": 6.331898766700925e-06, + "loss": 0.4696, + "step": 23450 + }, + { + "epoch": 0.7491383712024509, + "grad_norm": 0.5624365210533142, + "learning_rate": 6.311825539568346e-06, + "loss": 0.4453, + "step": 23475 + }, + { + "epoch": 0.749936175644626, + "grad_norm": 0.6948854923248291, + "learning_rate": 6.291752312435766e-06, + "loss": 0.4108, + "step": 23500 + }, + { + "epoch": 0.7507339800868011, + "grad_norm": 0.8782447576522827, + "learning_rate": 6.271679085303187e-06, + "loss": 0.4025, + "step": 23525 + }, + { + "epoch": 0.7515317845289763, + "grad_norm": 0.5253655910491943, + "learning_rate": 6.251605858170606e-06, + "loss": 0.4388, + "step": 23550 + }, + { + "epoch": 0.7523295889711514, + "grad_norm": 0.7023342847824097, + "learning_rate": 6.231532631038027e-06, + "loss": 0.4273, + "step": 23575 + }, + { + "epoch": 0.7531273934133266, + "grad_norm": 0.8842118978500366, + "learning_rate": 6.211459403905448e-06, + "loss": 0.5036, + "step": 23600 + }, + { + "epoch": 0.7539251978555016, + "grad_norm": 1.4032319784164429, + "learning_rate": 6.191386176772868e-06, + "loss": 0.4888, + "step": 23625 + }, + { + "epoch": 0.7547230022976767, + "grad_norm": 0.44206473231315613, + "learning_rate": 6.171312949640288e-06, + "loss": 0.4463, + "step": 23650 + }, + { + "epoch": 0.7555208067398519, + "grad_norm": 1.5239768028259277, + "learning_rate": 6.1512397225077085e-06, + "loss": 0.4516, + "step": 23675 + }, + { + "epoch": 0.756318611182027, + "grad_norm": 0.9154671430587769, + "learning_rate": 6.131166495375129e-06, + "loss": 0.5288, + "step": 23700 + }, + { + "epoch": 0.7571164156242022, + "grad_norm": 0.9400933384895325, + "learning_rate": 6.111093268242549e-06, + "loss": 0.4362, + "step": 23725 + }, + { + "epoch": 0.7579142200663773, + "grad_norm": 1.0026756525039673, + "learning_rate": 6.091020041109969e-06, + "loss": 0.4442, + "step": 23750 + }, + { + "epoch": 0.7587120245085525, + "grad_norm": 0.9670106172561646, + "learning_rate": 6.07094681397739e-06, + "loss": 0.4778, + "step": 23775 + }, + { + "epoch": 0.7595098289507276, + "grad_norm": 0.8731892704963684, + "learning_rate": 6.05087358684481e-06, + "loss": 0.5162, + "step": 23800 + }, + { + "epoch": 0.7603076333929027, + "grad_norm": 0.8117145895957947, + "learning_rate": 6.03080035971223e-06, + "loss": 0.4648, + "step": 23825 + }, + { + "epoch": 0.7611054378350779, + "grad_norm": 0.8562591671943665, + "learning_rate": 6.0107271325796505e-06, + "loss": 0.4668, + "step": 23850 + }, + { + "epoch": 0.761903242277253, + "grad_norm": 1.0122448205947876, + "learning_rate": 5.990653905447071e-06, + "loss": 0.4951, + "step": 23875 + }, + { + "epoch": 0.7627010467194282, + "grad_norm": 0.6894921064376831, + "learning_rate": 5.970580678314491e-06, + "loss": 0.4328, + "step": 23900 + }, + { + "epoch": 0.7634988511616033, + "grad_norm": 1.251848816871643, + "learning_rate": 5.950507451181911e-06, + "loss": 0.5219, + "step": 23925 + }, + { + "epoch": 0.7642966556037784, + "grad_norm": 0.6008142232894897, + "learning_rate": 5.9304342240493325e-06, + "loss": 0.4343, + "step": 23950 + }, + { + "epoch": 0.7650944600459535, + "grad_norm": 1.4150334596633911, + "learning_rate": 5.910360996916753e-06, + "loss": 0.4935, + "step": 23975 + }, + { + "epoch": 0.7658922644881286, + "grad_norm": 1.140459656715393, + "learning_rate": 5.890287769784173e-06, + "loss": 0.4381, + "step": 24000 + }, + { 
+ "epoch": 0.7666900689303038, + "grad_norm": 1.0855172872543335, + "learning_rate": 5.870214542651593e-06, + "loss": 0.4398, + "step": 24025 + }, + { + "epoch": 0.7674878733724789, + "grad_norm": 1.5270390510559082, + "learning_rate": 5.850141315519014e-06, + "loss": 0.4635, + "step": 24050 + }, + { + "epoch": 0.7682856778146541, + "grad_norm": 0.49588742852211, + "learning_rate": 5.830068088386434e-06, + "loss": 0.4111, + "step": 24075 + }, + { + "epoch": 0.7690834822568292, + "grad_norm": 0.7938015460968018, + "learning_rate": 5.809994861253854e-06, + "loss": 0.4, + "step": 24100 + }, + { + "epoch": 0.7698812866990044, + "grad_norm": 1.051199197769165, + "learning_rate": 5.7899216341212745e-06, + "loss": 0.4471, + "step": 24125 + }, + { + "epoch": 0.7706790911411795, + "grad_norm": 0.41370636224746704, + "learning_rate": 5.769848406988695e-06, + "loss": 0.3582, + "step": 24150 + }, + { + "epoch": 0.7714768955833546, + "grad_norm": 0.6135370135307312, + "learning_rate": 5.749775179856116e-06, + "loss": 0.4763, + "step": 24175 + }, + { + "epoch": 0.7722747000255298, + "grad_norm": 0.6476448178291321, + "learning_rate": 5.729701952723536e-06, + "loss": 0.4751, + "step": 24200 + }, + { + "epoch": 0.7730725044677049, + "grad_norm": 0.5266830325126648, + "learning_rate": 5.7096287255909565e-06, + "loss": 0.4136, + "step": 24225 + }, + { + "epoch": 0.77387030890988, + "grad_norm": 0.7233723998069763, + "learning_rate": 5.689555498458377e-06, + "loss": 0.4664, + "step": 24250 + }, + { + "epoch": 0.7746681133520551, + "grad_norm": 1.1774080991744995, + "learning_rate": 5.669482271325797e-06, + "loss": 0.4115, + "step": 24275 + }, + { + "epoch": 0.7754659177942302, + "grad_norm": 0.9674144983291626, + "learning_rate": 5.649409044193217e-06, + "loss": 0.4505, + "step": 24300 + }, + { + "epoch": 0.7762637222364054, + "grad_norm": 0.6825979351997375, + "learning_rate": 5.629335817060638e-06, + "loss": 0.5304, + "step": 24325 + }, + { + "epoch": 0.7770615266785805, + "grad_norm": 0.5845705270767212, + "learning_rate": 5.609262589928058e-06, + "loss": 0.4467, + "step": 24350 + }, + { + "epoch": 0.7778593311207557, + "grad_norm": 1.3499436378479004, + "learning_rate": 5.589189362795478e-06, + "loss": 0.376, + "step": 24375 + }, + { + "epoch": 0.7786571355629308, + "grad_norm": 0.782960832118988, + "learning_rate": 5.5691161356628985e-06, + "loss": 0.4922, + "step": 24400 + }, + { + "epoch": 0.779454940005106, + "grad_norm": 0.5972276926040649, + "learning_rate": 5.549042908530319e-06, + "loss": 0.3807, + "step": 24425 + }, + { + "epoch": 0.7802527444472811, + "grad_norm": 0.9225984811782837, + "learning_rate": 5.528969681397739e-06, + "loss": 0.3951, + "step": 24450 + }, + { + "epoch": 0.7810505488894562, + "grad_norm": 0.6132213473320007, + "learning_rate": 5.508896454265159e-06, + "loss": 0.4606, + "step": 24475 + }, + { + "epoch": 0.7818483533316314, + "grad_norm": 0.8292983174324036, + "learning_rate": 5.48882322713258e-06, + "loss": 0.4301, + "step": 24500 + }, + { + "epoch": 0.7826461577738065, + "grad_norm": 0.8979394435882568, + "learning_rate": 5.46875e-06, + "loss": 0.4381, + "step": 24525 + }, + { + "epoch": 0.7834439622159817, + "grad_norm": 0.7707974910736084, + "learning_rate": 5.44867677286742e-06, + "loss": 0.4548, + "step": 24550 + }, + { + "epoch": 0.7842417666581567, + "grad_norm": 0.7816491723060608, + "learning_rate": 5.4286035457348405e-06, + "loss": 0.4446, + "step": 24575 + }, + { + "epoch": 0.7850395711003318, + "grad_norm": 1.1294130086898804, + "learning_rate": 
5.408530318602261e-06, + "loss": 0.5052, + "step": 24600 + }, + { + "epoch": 0.785837375542507, + "grad_norm": 0.662937343120575, + "learning_rate": 5.388457091469681e-06, + "loss": 0.3954, + "step": 24625 + }, + { + "epoch": 0.7866351799846821, + "grad_norm": 0.5076242685317993, + "learning_rate": 5.368383864337102e-06, + "loss": 0.488, + "step": 24650 + }, + { + "epoch": 0.7874329844268573, + "grad_norm": 1.2003448009490967, + "learning_rate": 5.3483106372045225e-06, + "loss": 0.4162, + "step": 24675 + }, + { + "epoch": 0.7882307888690324, + "grad_norm": 0.9463032484054565, + "learning_rate": 5.328237410071943e-06, + "loss": 0.4827, + "step": 24700 + }, + { + "epoch": 0.7890285933112076, + "grad_norm": 0.827299177646637, + "learning_rate": 5.308164182939363e-06, + "loss": 0.4346, + "step": 24725 + }, + { + "epoch": 0.7898263977533827, + "grad_norm": 0.7343999147415161, + "learning_rate": 5.288090955806783e-06, + "loss": 0.4749, + "step": 24750 + }, + { + "epoch": 0.7906242021955578, + "grad_norm": 1.009519100189209, + "learning_rate": 5.268017728674204e-06, + "loss": 0.4613, + "step": 24775 + }, + { + "epoch": 0.791422006637733, + "grad_norm": 0.7076058387756348, + "learning_rate": 5.247944501541624e-06, + "loss": 0.4468, + "step": 24800 + }, + { + "epoch": 0.7922198110799081, + "grad_norm": 0.6096115708351135, + "learning_rate": 5.227871274409044e-06, + "loss": 0.4518, + "step": 24825 + }, + { + "epoch": 0.7930176155220833, + "grad_norm": 0.813082218170166, + "learning_rate": 5.2077980472764645e-06, + "loss": 0.4212, + "step": 24850 + }, + { + "epoch": 0.7938154199642583, + "grad_norm": 0.5449020862579346, + "learning_rate": 5.187724820143885e-06, + "loss": 0.4565, + "step": 24875 + }, + { + "epoch": 0.7946132244064334, + "grad_norm": 1.2484004497528076, + "learning_rate": 5.167651593011306e-06, + "loss": 0.506, + "step": 24900 + }, + { + "epoch": 0.7954110288486086, + "grad_norm": 0.6107988357543945, + "learning_rate": 5.147578365878726e-06, + "loss": 0.4591, + "step": 24925 + }, + { + "epoch": 0.7962088332907837, + "grad_norm": 1.039955735206604, + "learning_rate": 5.1275051387461465e-06, + "loss": 0.3877, + "step": 24950 + }, + { + "epoch": 0.7970066377329589, + "grad_norm": 0.9229916930198669, + "learning_rate": 5.107431911613567e-06, + "loss": 0.5025, + "step": 24975 + }, + { + "epoch": 0.797804442175134, + "grad_norm": 1.3384779691696167, + "learning_rate": 5.087358684480987e-06, + "loss": 0.4399, + "step": 25000 + }, + { + "epoch": 0.7986022466173092, + "grad_norm": 0.9055022597312927, + "learning_rate": 5.067285457348407e-06, + "loss": 0.3905, + "step": 25025 + }, + { + "epoch": 0.7994000510594843, + "grad_norm": 1.076596736907959, + "learning_rate": 5.047212230215828e-06, + "loss": 0.4646, + "step": 25050 + }, + { + "epoch": 0.8001978555016594, + "grad_norm": 0.8877248764038086, + "learning_rate": 5.027139003083248e-06, + "loss": 0.4294, + "step": 25075 + }, + { + "epoch": 0.8009956599438346, + "grad_norm": 0.8110154867172241, + "learning_rate": 5.007065775950668e-06, + "loss": 0.4358, + "step": 25100 + }, + { + "epoch": 0.8017934643860097, + "grad_norm": 0.4719976782798767, + "learning_rate": 4.986992548818089e-06, + "loss": 0.4037, + "step": 25125 + }, + { + "epoch": 0.8025912688281849, + "grad_norm": 0.9919301867485046, + "learning_rate": 4.96691932168551e-06, + "loss": 0.4317, + "step": 25150 + }, + { + "epoch": 0.80338907327036, + "grad_norm": 0.7624319195747375, + "learning_rate": 4.94684609455293e-06, + "loss": 0.4069, + "step": 25175 + }, + { + "epoch": 
0.804186877712535, + "grad_norm": 1.2844278812408447, + "learning_rate": 4.926772867420349e-06, + "loss": 0.4065, + "step": 25200 + }, + { + "epoch": 0.8049846821547102, + "grad_norm": 1.9668337106704712, + "learning_rate": 4.90669964028777e-06, + "loss": 0.5158, + "step": 25225 + }, + { + "epoch": 0.8057824865968853, + "grad_norm": 0.7960777282714844, + "learning_rate": 4.88662641315519e-06, + "loss": 0.4561, + "step": 25250 + }, + { + "epoch": 0.8065802910390605, + "grad_norm": 1.1442573070526123, + "learning_rate": 4.86655318602261e-06, + "loss": 0.4389, + "step": 25275 + }, + { + "epoch": 0.8073780954812356, + "grad_norm": 0.6414112448692322, + "learning_rate": 4.8464799588900305e-06, + "loss": 0.3954, + "step": 25300 + }, + { + "epoch": 0.8081758999234108, + "grad_norm": 1.3013173341751099, + "learning_rate": 4.826406731757451e-06, + "loss": 0.4712, + "step": 25325 + }, + { + "epoch": 0.8089737043655859, + "grad_norm": 1.218284249305725, + "learning_rate": 4.806333504624871e-06, + "loss": 0.4355, + "step": 25350 + }, + { + "epoch": 0.809771508807761, + "grad_norm": 0.918598473072052, + "learning_rate": 4.786260277492292e-06, + "loss": 0.4291, + "step": 25375 + }, + { + "epoch": 0.8105693132499362, + "grad_norm": 0.6754209399223328, + "learning_rate": 4.7661870503597125e-06, + "loss": 0.4094, + "step": 25400 + }, + { + "epoch": 0.8113671176921113, + "grad_norm": 0.45365118980407715, + "learning_rate": 4.746113823227133e-06, + "loss": 0.4213, + "step": 25425 + }, + { + "epoch": 0.8121649221342865, + "grad_norm": 0.9628251791000366, + "learning_rate": 4.726040596094553e-06, + "loss": 0.467, + "step": 25450 + }, + { + "epoch": 0.8129627265764616, + "grad_norm": 0.6204245686531067, + "learning_rate": 4.705967368961973e-06, + "loss": 0.4692, + "step": 25475 + }, + { + "epoch": 0.8137605310186367, + "grad_norm": 0.8180040717124939, + "learning_rate": 4.685894141829394e-06, + "loss": 0.4403, + "step": 25500 + }, + { + "epoch": 0.8145583354608118, + "grad_norm": 0.6478278040885925, + "learning_rate": 4.665820914696814e-06, + "loss": 0.4135, + "step": 25525 + }, + { + "epoch": 0.8153561399029869, + "grad_norm": 0.9499540328979492, + "learning_rate": 4.645747687564234e-06, + "loss": 0.4316, + "step": 25550 + }, + { + "epoch": 0.8161539443451621, + "grad_norm": 0.9487488865852356, + "learning_rate": 4.6256744604316545e-06, + "loss": 0.4531, + "step": 25575 + }, + { + "epoch": 0.8169517487873372, + "grad_norm": 1.0303360223770142, + "learning_rate": 4.605601233299076e-06, + "loss": 0.4601, + "step": 25600 + }, + { + "epoch": 0.8177495532295124, + "grad_norm": 0.5638821125030518, + "learning_rate": 4.585528006166496e-06, + "loss": 0.4367, + "step": 25625 + }, + { + "epoch": 0.8185473576716875, + "grad_norm": 0.6594163179397583, + "learning_rate": 4.565454779033916e-06, + "loss": 0.4313, + "step": 25650 + }, + { + "epoch": 0.8193451621138627, + "grad_norm": 0.9298909306526184, + "learning_rate": 4.5453815519013365e-06, + "loss": 0.553, + "step": 25675 + }, + { + "epoch": 0.8201429665560378, + "grad_norm": 0.6085266470909119, + "learning_rate": 4.525308324768757e-06, + "loss": 0.4837, + "step": 25700 + }, + { + "epoch": 0.8209407709982129, + "grad_norm": 0.8190747499465942, + "learning_rate": 4.505235097636177e-06, + "loss": 0.4788, + "step": 25725 + }, + { + "epoch": 0.8217385754403881, + "grad_norm": 1.0059787034988403, + "learning_rate": 4.485161870503597e-06, + "loss": 0.4349, + "step": 25750 + }, + { + "epoch": 0.8225363798825632, + "grad_norm": 0.9931736588478088, + "learning_rate": 
4.465088643371018e-06, + "loss": 0.4796, + "step": 25775 + }, + { + "epoch": 0.8233341843247384, + "grad_norm": 1.0960803031921387, + "learning_rate": 4.445015416238438e-06, + "loss": 0.4288, + "step": 25800 + }, + { + "epoch": 0.8241319887669134, + "grad_norm": 0.7804691791534424, + "learning_rate": 4.424942189105859e-06, + "loss": 0.4688, + "step": 25825 + }, + { + "epoch": 0.8249297932090885, + "grad_norm": 0.49692556262016296, + "learning_rate": 4.404868961973279e-06, + "loss": 0.453, + "step": 25850 + }, + { + "epoch": 0.8257275976512637, + "grad_norm": 0.8641229867935181, + "learning_rate": 4.3847957348407e-06, + "loss": 0.4549, + "step": 25875 + }, + { + "epoch": 0.8265254020934388, + "grad_norm": 0.7968979477882385, + "learning_rate": 4.36472250770812e-06, + "loss": 0.3873, + "step": 25900 + }, + { + "epoch": 0.827323206535614, + "grad_norm": 0.6417710781097412, + "learning_rate": 4.34464928057554e-06, + "loss": 0.3927, + "step": 25925 + }, + { + "epoch": 0.8281210109777891, + "grad_norm": 0.7604640126228333, + "learning_rate": 4.3245760534429605e-06, + "loss": 0.466, + "step": 25950 + }, + { + "epoch": 0.8289188154199643, + "grad_norm": 1.0356528759002686, + "learning_rate": 4.304502826310381e-06, + "loss": 0.4877, + "step": 25975 + }, + { + "epoch": 0.8297166198621394, + "grad_norm": 0.8598850965499878, + "learning_rate": 4.284429599177801e-06, + "loss": 0.3717, + "step": 26000 + }, + { + "epoch": 0.8305144243043145, + "grad_norm": 1.647703766822815, + "learning_rate": 4.264356372045221e-06, + "loss": 0.4876, + "step": 26025 + }, + { + "epoch": 0.8313122287464897, + "grad_norm": 0.6927606463432312, + "learning_rate": 4.244283144912642e-06, + "loss": 0.4737, + "step": 26050 + }, + { + "epoch": 0.8321100331886648, + "grad_norm": 1.9317525625228882, + "learning_rate": 4.224209917780062e-06, + "loss": 0.4072, + "step": 26075 + }, + { + "epoch": 0.83290783763084, + "grad_norm": 1.4476104974746704, + "learning_rate": 4.204136690647482e-06, + "loss": 0.4776, + "step": 26100 + }, + { + "epoch": 0.833705642073015, + "grad_norm": 1.3182293176651, + "learning_rate": 4.1840634635149024e-06, + "loss": 0.4551, + "step": 26125 + }, + { + "epoch": 0.8345034465151902, + "grad_norm": 0.4591107666492462, + "learning_rate": 4.163990236382323e-06, + "loss": 0.5006, + "step": 26150 + }, + { + "epoch": 0.8353012509573653, + "grad_norm": 2.0317952632904053, + "learning_rate": 4.143917009249743e-06, + "loss": 0.3685, + "step": 26175 + }, + { + "epoch": 0.8360990553995404, + "grad_norm": 1.3375020027160645, + "learning_rate": 4.123843782117163e-06, + "loss": 0.391, + "step": 26200 + }, + { + "epoch": 0.8368968598417156, + "grad_norm": 0.8244248628616333, + "learning_rate": 4.103770554984584e-06, + "loss": 0.4395, + "step": 26225 + }, + { + "epoch": 0.8376946642838907, + "grad_norm": 0.8836075663566589, + "learning_rate": 4.083697327852004e-06, + "loss": 0.4241, + "step": 26250 + }, + { + "epoch": 0.8384924687260659, + "grad_norm": 0.8446002006530762, + "learning_rate": 4.063624100719424e-06, + "loss": 0.4999, + "step": 26275 + }, + { + "epoch": 0.839290273168241, + "grad_norm": 0.6555794477462769, + "learning_rate": 4.043550873586845e-06, + "loss": 0.5398, + "step": 26300 + }, + { + "epoch": 0.8400880776104162, + "grad_norm": 1.4175652265548706, + "learning_rate": 4.023477646454266e-06, + "loss": 0.4261, + "step": 26325 + }, + { + "epoch": 0.8408858820525913, + "grad_norm": 0.6591663956642151, + "learning_rate": 4.003404419321686e-06, + "loss": 0.4852, + "step": 26350 + }, + { + "epoch": 
0.8416836864947664, + "grad_norm": 0.9750247597694397, + "learning_rate": 3.983331192189106e-06, + "loss": 0.423, + "step": 26375 + }, + { + "epoch": 0.8424814909369416, + "grad_norm": 0.7487388253211975, + "learning_rate": 3.9632579650565264e-06, + "loss": 0.407, + "step": 26400 + }, + { + "epoch": 0.8432792953791167, + "grad_norm": 0.8284870982170105, + "learning_rate": 3.943184737923947e-06, + "loss": 0.4602, + "step": 26425 + }, + { + "epoch": 0.8440770998212918, + "grad_norm": 0.8264731168746948, + "learning_rate": 3.923111510791367e-06, + "loss": 0.4068, + "step": 26450 + }, + { + "epoch": 0.8448749042634669, + "grad_norm": 0.648423969745636, + "learning_rate": 3.903038283658787e-06, + "loss": 0.4144, + "step": 26475 + }, + { + "epoch": 0.845672708705642, + "grad_norm": 1.2434682846069336, + "learning_rate": 3.882965056526208e-06, + "loss": 0.4697, + "step": 26500 + }, + { + "epoch": 0.8464705131478172, + "grad_norm": 1.1299934387207031, + "learning_rate": 3.862891829393628e-06, + "loss": 0.479, + "step": 26525 + }, + { + "epoch": 0.8472683175899923, + "grad_norm": 0.8019355535507202, + "learning_rate": 3.842818602261049e-06, + "loss": 0.4719, + "step": 26550 + }, + { + "epoch": 0.8480661220321675, + "grad_norm": 1.5749025344848633, + "learning_rate": 3.822745375128469e-06, + "loss": 0.4612, + "step": 26575 + }, + { + "epoch": 0.8488639264743426, + "grad_norm": 1.2036354541778564, + "learning_rate": 3.802672147995889e-06, + "loss": 0.4539, + "step": 26600 + }, + { + "epoch": 0.8496617309165178, + "grad_norm": 1.055600881576538, + "learning_rate": 3.7825989208633094e-06, + "loss": 0.4538, + "step": 26625 + }, + { + "epoch": 0.8504595353586929, + "grad_norm": 1.1564183235168457, + "learning_rate": 3.7625256937307297e-06, + "loss": 0.5394, + "step": 26650 + }, + { + "epoch": 0.851257339800868, + "grad_norm": 1.1138954162597656, + "learning_rate": 3.74245246659815e-06, + "loss": 0.4532, + "step": 26675 + }, + { + "epoch": 0.8520551442430432, + "grad_norm": 0.5815456509590149, + "learning_rate": 3.7223792394655703e-06, + "loss": 0.4968, + "step": 26700 + }, + { + "epoch": 0.8528529486852183, + "grad_norm": 0.4686177372932434, + "learning_rate": 3.7023060123329906e-06, + "loss": 0.4501, + "step": 26725 + }, + { + "epoch": 0.8536507531273934, + "grad_norm": 0.5037088990211487, + "learning_rate": 3.682232785200411e-06, + "loss": 0.4265, + "step": 26750 + }, + { + "epoch": 0.8544485575695685, + "grad_norm": 0.9735825061798096, + "learning_rate": 3.662159558067832e-06, + "loss": 0.3994, + "step": 26775 + }, + { + "epoch": 0.8552463620117436, + "grad_norm": 1.6671348810195923, + "learning_rate": 3.6420863309352523e-06, + "loss": 0.4917, + "step": 26800 + }, + { + "epoch": 0.8560441664539188, + "grad_norm": 0.9147958159446716, + "learning_rate": 3.6220131038026726e-06, + "loss": 0.4934, + "step": 26825 + }, + { + "epoch": 0.8568419708960939, + "grad_norm": 0.7258955240249634, + "learning_rate": 3.601939876670093e-06, + "loss": 0.4512, + "step": 26850 + }, + { + "epoch": 0.8576397753382691, + "grad_norm": 1.2786023616790771, + "learning_rate": 3.581866649537513e-06, + "loss": 0.449, + "step": 26875 + }, + { + "epoch": 0.8584375797804442, + "grad_norm": 1.0247211456298828, + "learning_rate": 3.5617934224049334e-06, + "loss": 0.4312, + "step": 26900 + }, + { + "epoch": 0.8592353842226194, + "grad_norm": 0.9334677457809448, + "learning_rate": 3.5417201952723537e-06, + "loss": 0.4484, + "step": 26925 + }, + { + "epoch": 0.8600331886647945, + "grad_norm": 1.0831176042556763, + "learning_rate": 
3.521646968139774e-06, + "loss": 0.439, + "step": 26950 + }, + { + "epoch": 0.8608309931069696, + "grad_norm": 0.9988906979560852, + "learning_rate": 3.5015737410071943e-06, + "loss": 0.4584, + "step": 26975 + }, + { + "epoch": 0.8616287975491448, + "grad_norm": 0.7575721740722656, + "learning_rate": 3.4815005138746146e-06, + "loss": 0.4597, + "step": 27000 + }, + { + "epoch": 0.8624266019913199, + "grad_norm": 0.5323251485824585, + "learning_rate": 3.4614272867420353e-06, + "loss": 0.4362, + "step": 27025 + }, + { + "epoch": 0.8632244064334951, + "grad_norm": 0.7563241720199585, + "learning_rate": 3.4413540596094556e-06, + "loss": 0.3917, + "step": 27050 + }, + { + "epoch": 0.8640222108756701, + "grad_norm": 0.6328034996986389, + "learning_rate": 3.421280832476876e-06, + "loss": 0.506, + "step": 27075 + }, + { + "epoch": 0.8648200153178452, + "grad_norm": 0.6230515241622925, + "learning_rate": 3.401207605344296e-06, + "loss": 0.4231, + "step": 27100 + }, + { + "epoch": 0.8656178197600204, + "grad_norm": 1.0412312746047974, + "learning_rate": 3.3811343782117164e-06, + "loss": 0.3983, + "step": 27125 + }, + { + "epoch": 0.8664156242021955, + "grad_norm": 1.1394895315170288, + "learning_rate": 3.3610611510791367e-06, + "loss": 0.447, + "step": 27150 + }, + { + "epoch": 0.8672134286443707, + "grad_norm": 0.7843630909919739, + "learning_rate": 3.340987923946557e-06, + "loss": 0.4271, + "step": 27175 + }, + { + "epoch": 0.8680112330865458, + "grad_norm": 0.6892642378807068, + "learning_rate": 3.3209146968139773e-06, + "loss": 0.4125, + "step": 27200 + }, + { + "epoch": 0.868809037528721, + "grad_norm": 1.255333662033081, + "learning_rate": 3.3008414696813976e-06, + "loss": 0.492, + "step": 27225 + }, + { + "epoch": 0.8696068419708961, + "grad_norm": 1.151365041732788, + "learning_rate": 3.2807682425488187e-06, + "loss": 0.4146, + "step": 27250 + }, + { + "epoch": 0.8704046464130712, + "grad_norm": 1.0154856443405151, + "learning_rate": 3.260695015416239e-06, + "loss": 0.4375, + "step": 27275 + }, + { + "epoch": 0.8712024508552464, + "grad_norm": 1.9048150777816772, + "learning_rate": 3.2406217882836593e-06, + "loss": 0.4004, + "step": 27300 + }, + { + "epoch": 0.8720002552974215, + "grad_norm": 1.4371875524520874, + "learning_rate": 3.2205485611510795e-06, + "loss": 0.5112, + "step": 27325 + }, + { + "epoch": 0.8727980597395967, + "grad_norm": 0.8767828345298767, + "learning_rate": 3.2004753340185e-06, + "loss": 0.4611, + "step": 27350 + }, + { + "epoch": 0.8735958641817718, + "grad_norm": 0.9504615664482117, + "learning_rate": 3.18040210688592e-06, + "loss": 0.4734, + "step": 27375 + }, + { + "epoch": 0.8743936686239469, + "grad_norm": 0.7097592353820801, + "learning_rate": 3.1603288797533404e-06, + "loss": 0.4001, + "step": 27400 + }, + { + "epoch": 0.875191473066122, + "grad_norm": 0.9382915496826172, + "learning_rate": 3.1402556526207607e-06, + "loss": 0.4468, + "step": 27425 + }, + { + "epoch": 0.8759892775082971, + "grad_norm": 0.840699315071106, + "learning_rate": 3.120182425488181e-06, + "loss": 0.4581, + "step": 27450 + }, + { + "epoch": 0.8767870819504723, + "grad_norm": 0.5212263464927673, + "learning_rate": 3.1001091983556013e-06, + "loss": 0.4597, + "step": 27475 + }, + { + "epoch": 0.8775848863926474, + "grad_norm": 1.300909161567688, + "learning_rate": 3.0800359712230215e-06, + "loss": 0.4125, + "step": 27500 + }, + { + "epoch": 0.8783826908348226, + "grad_norm": 1.2471950054168701, + "learning_rate": 3.059962744090442e-06, + "loss": 0.4136, + "step": 27525 + }, + { + "epoch": 
0.8791804952769977, + "grad_norm": 1.0161088705062866, + "learning_rate": 3.0398895169578625e-06, + "loss": 0.4226, + "step": 27550 + }, + { + "epoch": 0.8799782997191729, + "grad_norm": 0.7740815877914429, + "learning_rate": 3.019816289825283e-06, + "loss": 0.4074, + "step": 27575 + }, + { + "epoch": 0.880776104161348, + "grad_norm": 0.5264682769775391, + "learning_rate": 2.999743062692703e-06, + "loss": 0.4391, + "step": 27600 + }, + { + "epoch": 0.8815739086035231, + "grad_norm": 0.3435772657394409, + "learning_rate": 2.9796698355601234e-06, + "loss": 0.5446, + "step": 27625 + }, + { + "epoch": 0.8823717130456983, + "grad_norm": 2.251746654510498, + "learning_rate": 2.9595966084275437e-06, + "loss": 0.4649, + "step": 27650 + }, + { + "epoch": 0.8831695174878734, + "grad_norm": 0.6416149139404297, + "learning_rate": 2.9395233812949644e-06, + "loss": 0.4889, + "step": 27675 + }, + { + "epoch": 0.8839673219300485, + "grad_norm": 1.3342888355255127, + "learning_rate": 2.9194501541623847e-06, + "loss": 0.4606, + "step": 27700 + }, + { + "epoch": 0.8847651263722236, + "grad_norm": 0.7645418047904968, + "learning_rate": 2.899376927029805e-06, + "loss": 0.4643, + "step": 27725 + }, + { + "epoch": 0.8855629308143987, + "grad_norm": 1.366817593574524, + "learning_rate": 2.8793036998972252e-06, + "loss": 0.4273, + "step": 27750 + }, + { + "epoch": 0.8863607352565739, + "grad_norm": 0.8227196931838989, + "learning_rate": 2.8592304727646455e-06, + "loss": 0.4334, + "step": 27775 + }, + { + "epoch": 0.887158539698749, + "grad_norm": 0.8876248002052307, + "learning_rate": 2.839157245632066e-06, + "loss": 0.4379, + "step": 27800 + }, + { + "epoch": 0.8879563441409242, + "grad_norm": 0.5903315544128418, + "learning_rate": 2.819084018499486e-06, + "loss": 0.4365, + "step": 27825 + }, + { + "epoch": 0.8887541485830993, + "grad_norm": 0.8090491890907288, + "learning_rate": 2.7990107913669064e-06, + "loss": 0.4082, + "step": 27850 + }, + { + "epoch": 0.8895519530252745, + "grad_norm": 1.499947190284729, + "learning_rate": 2.7789375642343267e-06, + "loss": 0.4417, + "step": 27875 + }, + { + "epoch": 0.8903497574674496, + "grad_norm": 1.3385909795761108, + "learning_rate": 2.7588643371017474e-06, + "loss": 0.5155, + "step": 27900 + }, + { + "epoch": 0.8911475619096247, + "grad_norm": 0.8796870112419128, + "learning_rate": 2.7387911099691677e-06, + "loss": 0.4104, + "step": 27925 + }, + { + "epoch": 0.8919453663517999, + "grad_norm": 0.5240228772163391, + "learning_rate": 2.718717882836588e-06, + "loss": 0.4295, + "step": 27950 + }, + { + "epoch": 0.892743170793975, + "grad_norm": 1.00859534740448, + "learning_rate": 2.6986446557040082e-06, + "loss": 0.4137, + "step": 27975 + }, + { + "epoch": 0.8935409752361502, + "grad_norm": 0.6330901980400085, + "learning_rate": 2.6785714285714285e-06, + "loss": 0.4976, + "step": 28000 + }, + { + "epoch": 0.8943387796783252, + "grad_norm": 1.969018578529358, + "learning_rate": 2.6584982014388492e-06, + "loss": 0.3857, + "step": 28025 + }, + { + "epoch": 0.8951365841205003, + "grad_norm": 0.9891221523284912, + "learning_rate": 2.6384249743062695e-06, + "loss": 0.365, + "step": 28050 + }, + { + "epoch": 0.8959343885626755, + "grad_norm": 0.9894170165061951, + "learning_rate": 2.61835174717369e-06, + "loss": 0.378, + "step": 28075 + }, + { + "epoch": 0.8967321930048506, + "grad_norm": 0.8851298093795776, + "learning_rate": 2.59827852004111e-06, + "loss": 0.4099, + "step": 28100 + }, + { + "epoch": 0.8975299974470258, + "grad_norm": 1.1948890686035156, + "learning_rate": 
2.5782052929085304e-06, + "loss": 0.4556, + "step": 28125 + }, + { + "epoch": 0.8983278018892009, + "grad_norm": 0.6201117038726807, + "learning_rate": 2.558132065775951e-06, + "loss": 0.5078, + "step": 28150 + }, + { + "epoch": 0.8991256063313761, + "grad_norm": 0.5613217353820801, + "learning_rate": 2.5380588386433714e-06, + "loss": 0.4271, + "step": 28175 + }, + { + "epoch": 0.8999234107735512, + "grad_norm": 0.7274844646453857, + "learning_rate": 2.5179856115107916e-06, + "loss": 0.4116, + "step": 28200 + }, + { + "epoch": 0.9007212152157263, + "grad_norm": 0.6684631705284119, + "learning_rate": 2.4979123843782115e-06, + "loss": 0.4697, + "step": 28225 + }, + { + "epoch": 0.9015190196579015, + "grad_norm": 0.8778204321861267, + "learning_rate": 2.4778391572456322e-06, + "loss": 0.3628, + "step": 28250 + }, + { + "epoch": 0.9023168241000766, + "grad_norm": 1.1124216318130493, + "learning_rate": 2.4577659301130525e-06, + "loss": 0.4588, + "step": 28275 + }, + { + "epoch": 0.9031146285422518, + "grad_norm": 0.9175296425819397, + "learning_rate": 2.437692702980473e-06, + "loss": 0.4123, + "step": 28300 + }, + { + "epoch": 0.9039124329844268, + "grad_norm": 0.8441956639289856, + "learning_rate": 2.417619475847893e-06, + "loss": 0.3731, + "step": 28325 + }, + { + "epoch": 0.904710237426602, + "grad_norm": 0.42998984456062317, + "learning_rate": 2.3975462487153134e-06, + "loss": 0.464, + "step": 28350 + }, + { + "epoch": 0.9055080418687771, + "grad_norm": 1.1030632257461548, + "learning_rate": 2.377473021582734e-06, + "loss": 0.4391, + "step": 28375 + }, + { + "epoch": 0.9063058463109522, + "grad_norm": 1.7289245128631592, + "learning_rate": 2.3573997944501544e-06, + "loss": 0.4695, + "step": 28400 + }, + { + "epoch": 0.9071036507531274, + "grad_norm": 0.6781057715415955, + "learning_rate": 2.3373265673175746e-06, + "loss": 0.4401, + "step": 28425 + }, + { + "epoch": 0.9079014551953025, + "grad_norm": 1.0768778324127197, + "learning_rate": 2.317253340184995e-06, + "loss": 0.4557, + "step": 28450 + }, + { + "epoch": 0.9086992596374777, + "grad_norm": 0.7264516353607178, + "learning_rate": 2.2971801130524152e-06, + "loss": 0.482, + "step": 28475 + }, + { + "epoch": 0.9094970640796528, + "grad_norm": 1.0383821725845337, + "learning_rate": 2.277106885919836e-06, + "loss": 0.5131, + "step": 28500 + }, + { + "epoch": 0.910294868521828, + "grad_norm": 0.6867876052856445, + "learning_rate": 2.257033658787256e-06, + "loss": 0.4676, + "step": 28525 + }, + { + "epoch": 0.9110926729640031, + "grad_norm": 0.840083122253418, + "learning_rate": 2.2369604316546765e-06, + "loss": 0.4954, + "step": 28550 + }, + { + "epoch": 0.9118904774061782, + "grad_norm": 1.2478915452957153, + "learning_rate": 2.2168872045220968e-06, + "loss": 0.4991, + "step": 28575 + }, + { + "epoch": 0.9126882818483534, + "grad_norm": 0.5823152661323547, + "learning_rate": 2.196813977389517e-06, + "loss": 0.4539, + "step": 28600 + }, + { + "epoch": 0.9134860862905285, + "grad_norm": 1.4774327278137207, + "learning_rate": 2.1767407502569373e-06, + "loss": 0.5059, + "step": 28625 + }, + { + "epoch": 0.9142838907327036, + "grad_norm": 0.9385523796081543, + "learning_rate": 2.1566675231243576e-06, + "loss": 0.4734, + "step": 28650 + }, + { + "epoch": 0.9150816951748787, + "grad_norm": 0.7799694538116455, + "learning_rate": 2.136594295991778e-06, + "loss": 0.459, + "step": 28675 + }, + { + "epoch": 0.9158794996170538, + "grad_norm": 0.8785132765769958, + "learning_rate": 2.116521068859198e-06, + "loss": 0.4647, + "step": 28700 + }, + { + 
"epoch": 0.916677304059229, + "grad_norm": 0.6985654830932617, + "learning_rate": 2.096447841726619e-06, + "loss": 0.395, + "step": 28725 + }, + { + "epoch": 0.9174751085014041, + "grad_norm": 1.901870608329773, + "learning_rate": 2.076374614594039e-06, + "loss": 0.4138, + "step": 28750 + }, + { + "epoch": 0.9182729129435793, + "grad_norm": 0.8224325776100159, + "learning_rate": 2.0563013874614595e-06, + "loss": 0.4232, + "step": 28775 + }, + { + "epoch": 0.9190707173857544, + "grad_norm": 1.8118170499801636, + "learning_rate": 2.0362281603288798e-06, + "loss": 0.4041, + "step": 28800 + }, + { + "epoch": 0.9198685218279296, + "grad_norm": 1.5147721767425537, + "learning_rate": 2.0161549331963e-06, + "loss": 0.4933, + "step": 28825 + }, + { + "epoch": 0.9206663262701047, + "grad_norm": 0.9193384051322937, + "learning_rate": 1.9960817060637208e-06, + "loss": 0.4328, + "step": 28850 + }, + { + "epoch": 0.9214641307122798, + "grad_norm": 0.6495897769927979, + "learning_rate": 1.976008478931141e-06, + "loss": 0.4864, + "step": 28875 + }, + { + "epoch": 0.922261935154455, + "grad_norm": 1.427866816520691, + "learning_rate": 1.9559352517985613e-06, + "loss": 0.5036, + "step": 28900 + }, + { + "epoch": 0.9230597395966301, + "grad_norm": 0.9476510286331177, + "learning_rate": 1.9358620246659816e-06, + "loss": 0.4221, + "step": 28925 + }, + { + "epoch": 0.9238575440388052, + "grad_norm": 1.2440739870071411, + "learning_rate": 1.915788797533402e-06, + "loss": 0.3542, + "step": 28950 + }, + { + "epoch": 0.9246553484809803, + "grad_norm": 1.3304252624511719, + "learning_rate": 1.8957155704008224e-06, + "loss": 0.4482, + "step": 28975 + }, + { + "epoch": 0.9254531529231554, + "grad_norm": 0.4619370400905609, + "learning_rate": 1.8756423432682427e-06, + "loss": 0.4353, + "step": 29000 + }, + { + "epoch": 0.9262509573653306, + "grad_norm": 0.46142280101776123, + "learning_rate": 1.855569116135663e-06, + "loss": 0.4172, + "step": 29025 + }, + { + "epoch": 0.9270487618075057, + "grad_norm": 0.9833303093910217, + "learning_rate": 1.8354958890030833e-06, + "loss": 0.5051, + "step": 29050 + }, + { + "epoch": 0.9278465662496809, + "grad_norm": 0.7043890357017517, + "learning_rate": 1.8154226618705038e-06, + "loss": 0.5368, + "step": 29075 + }, + { + "epoch": 0.928644370691856, + "grad_norm": 0.7420092821121216, + "learning_rate": 1.795349434737924e-06, + "loss": 0.4528, + "step": 29100 + }, + { + "epoch": 0.9294421751340312, + "grad_norm": 1.0412980318069458, + "learning_rate": 1.7752762076053443e-06, + "loss": 0.6043, + "step": 29125 + }, + { + "epoch": 0.9302399795762063, + "grad_norm": 1.3793296813964844, + "learning_rate": 1.7552029804727646e-06, + "loss": 0.3907, + "step": 29150 + }, + { + "epoch": 0.9310377840183814, + "grad_norm": 1.4980815649032593, + "learning_rate": 1.735129753340185e-06, + "loss": 0.4657, + "step": 29175 + }, + { + "epoch": 0.9318355884605566, + "grad_norm": 0.6619548797607422, + "learning_rate": 1.7150565262076056e-06, + "loss": 0.4292, + "step": 29200 + }, + { + "epoch": 0.9326333929027317, + "grad_norm": 0.5773054361343384, + "learning_rate": 1.694983299075026e-06, + "loss": 0.3851, + "step": 29225 + }, + { + "epoch": 0.9334311973449069, + "grad_norm": 0.8674346804618835, + "learning_rate": 1.6749100719424462e-06, + "loss": 0.4751, + "step": 29250 + }, + { + "epoch": 0.9342290017870819, + "grad_norm": 0.7046436667442322, + "learning_rate": 1.6548368448098665e-06, + "loss": 0.4367, + "step": 29275 + }, + { + "epoch": 0.935026806229257, + "grad_norm": 0.8323100209236145, + 
"learning_rate": 1.6347636176772868e-06, + "loss": 0.5169, + "step": 29300 + }, + { + "epoch": 0.9358246106714322, + "grad_norm": 0.6985541582107544, + "learning_rate": 1.6146903905447072e-06, + "loss": 0.4482, + "step": 29325 + }, + { + "epoch": 0.9366224151136073, + "grad_norm": 1.2575738430023193, + "learning_rate": 1.5946171634121275e-06, + "loss": 0.4144, + "step": 29350 + }, + { + "epoch": 0.9374202195557825, + "grad_norm": 0.5694016814231873, + "learning_rate": 1.5745439362795478e-06, + "loss": 0.4731, + "step": 29375 + }, + { + "epoch": 0.9382180239979576, + "grad_norm": 0.4448241889476776, + "learning_rate": 1.5544707091469683e-06, + "loss": 0.4668, + "step": 29400 + }, + { + "epoch": 0.9390158284401328, + "grad_norm": 0.6371973752975464, + "learning_rate": 1.5343974820143886e-06, + "loss": 0.4542, + "step": 29425 + }, + { + "epoch": 0.9398136328823079, + "grad_norm": 0.6415957808494568, + "learning_rate": 1.5143242548818089e-06, + "loss": 0.403, + "step": 29450 + }, + { + "epoch": 0.940611437324483, + "grad_norm": 0.5851134657859802, + "learning_rate": 1.4942510277492294e-06, + "loss": 0.4501, + "step": 29475 + }, + { + "epoch": 0.9414092417666582, + "grad_norm": 1.1694976091384888, + "learning_rate": 1.4741778006166495e-06, + "loss": 0.474, + "step": 29500 + }, + { + "epoch": 0.9422070462088333, + "grad_norm": 0.941901683807373, + "learning_rate": 1.45410457348407e-06, + "loss": 0.469, + "step": 29525 + }, + { + "epoch": 0.9430048506510085, + "grad_norm": 0.8407112956047058, + "learning_rate": 1.4340313463514902e-06, + "loss": 0.4813, + "step": 29550 + }, + { + "epoch": 0.9438026550931835, + "grad_norm": 1.4796777963638306, + "learning_rate": 1.4139581192189107e-06, + "loss": 0.4733, + "step": 29575 + }, + { + "epoch": 0.9446004595353586, + "grad_norm": 0.6006361842155457, + "learning_rate": 1.393884892086331e-06, + "loss": 0.4195, + "step": 29600 + }, + { + "epoch": 0.9453982639775338, + "grad_norm": 0.835727334022522, + "learning_rate": 1.3738116649537513e-06, + "loss": 0.4281, + "step": 29625 + }, + { + "epoch": 0.9461960684197089, + "grad_norm": 0.8317933678627014, + "learning_rate": 1.3537384378211718e-06, + "loss": 0.4221, + "step": 29650 + }, + { + "epoch": 0.9469938728618841, + "grad_norm": 2.135981559753418, + "learning_rate": 1.333665210688592e-06, + "loss": 0.3963, + "step": 29675 + }, + { + "epoch": 0.9477916773040592, + "grad_norm": 1.3075852394104004, + "learning_rate": 1.3135919835560124e-06, + "loss": 0.4323, + "step": 29700 + }, + { + "epoch": 0.9485894817462344, + "grad_norm": 0.9642304182052612, + "learning_rate": 1.2935187564234327e-06, + "loss": 0.4221, + "step": 29725 + }, + { + "epoch": 0.9493872861884095, + "grad_norm": 1.0338194370269775, + "learning_rate": 1.273445529290853e-06, + "loss": 0.393, + "step": 29750 + }, + { + "epoch": 0.9501850906305847, + "grad_norm": 0.676669180393219, + "learning_rate": 1.2533723021582734e-06, + "loss": 0.4709, + "step": 29775 + }, + { + "epoch": 0.9509828950727598, + "grad_norm": 1.3416296243667603, + "learning_rate": 1.2332990750256937e-06, + "loss": 0.5081, + "step": 29800 + }, + { + "epoch": 0.9517806995149349, + "grad_norm": 0.7452605962753296, + "learning_rate": 1.2132258478931142e-06, + "loss": 0.4765, + "step": 29825 + }, + { + "epoch": 0.9525785039571101, + "grad_norm": 1.2132765054702759, + "learning_rate": 1.1931526207605345e-06, + "loss": 0.4737, + "step": 29850 + }, + { + "epoch": 0.9533763083992852, + "grad_norm": 1.0834319591522217, + "learning_rate": 1.1730793936279548e-06, + "loss": 0.3876, + "step": 
29875 + }, + { + "epoch": 0.9541741128414603, + "grad_norm": 0.4042583405971527, + "learning_rate": 1.153006166495375e-06, + "loss": 0.4086, + "step": 29900 + }, + { + "epoch": 0.9549719172836354, + "grad_norm": 0.618110716342926, + "learning_rate": 1.1329329393627954e-06, + "loss": 0.5247, + "step": 29925 + }, + { + "epoch": 0.9557697217258105, + "grad_norm": 0.5366015434265137, + "learning_rate": 1.1128597122302159e-06, + "loss": 0.3987, + "step": 29950 + }, + { + "epoch": 0.9565675261679857, + "grad_norm": 0.7964014410972595, + "learning_rate": 1.0927864850976362e-06, + "loss": 0.4654, + "step": 29975 + }, + { + "epoch": 0.9573653306101608, + "grad_norm": 1.018946886062622, + "learning_rate": 1.0727132579650566e-06, + "loss": 0.4827, + "step": 30000 + } + ], + "logging_steps": 25, + "max_steps": 31336, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.93204147273728e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}