diff --git "a/LoRA_parameters/trainer_state.json" "b/LoRA_parameters/trainer_state.json" new file mode 100644--- /dev/null +++ "b/LoRA_parameters/trainer_state.json" @@ -0,0 +1,17256 @@ +{ + "best_metric": 0.8637903928756714, + "best_model_checkpoint": "CTCLLMs_backboneTrain/checkpoints/LongSpeech_CTC-Shrink_augment_data_self_tokenizer_addMLS_projector_restore_2epoch_OpenASQA_LibriSQA_CommonVoice_random/checkpoint-44000", + "epoch": 2.0, + "eval_steps": 2000, + "global_step": 48676, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008217602103706139, + "grad_norm": 2.583192825317383, + "learning_rate": 2.737850787132101e-06, + "loss": 1.7034, + "step": 20 + }, + { + "epoch": 0.0016435204207412278, + "grad_norm": 1.2284363508224487, + "learning_rate": 5.475701574264202e-06, + "loss": 1.3595, + "step": 40 + }, + { + "epoch": 0.0024652806311118414, + "grad_norm": 0.7835968732833862, + "learning_rate": 8.213552361396305e-06, + "loss": 1.0712, + "step": 60 + }, + { + "epoch": 0.0032870408414824555, + "grad_norm": 0.7891668081283569, + "learning_rate": 1.0951403148528404e-05, + "loss": 0.9601, + "step": 80 + }, + { + "epoch": 0.004108801051853069, + "grad_norm": 0.7853180170059204, + "learning_rate": 1.3689253935660506e-05, + "loss": 0.9205, + "step": 100 + }, + { + "epoch": 0.004930561262223683, + "grad_norm": 0.8246210217475891, + "learning_rate": 1.642710472279261e-05, + "loss": 0.8811, + "step": 120 + }, + { + "epoch": 0.005752321472594297, + "grad_norm": 0.8358054757118225, + "learning_rate": 1.916495550992471e-05, + "loss": 0.8942, + "step": 140 + }, + { + "epoch": 0.006574081682964911, + "grad_norm": 0.8320625424385071, + "learning_rate": 2.190280629705681e-05, + "loss": 0.8571, + "step": 160 + }, + { + "epoch": 0.007395841893335525, + "grad_norm": 0.915210485458374, + "learning_rate": 2.464065708418891e-05, + "loss": 0.8753, + "step": 180 + }, + { + "epoch": 0.008217602103706138, + "grad_norm": 0.819446861743927, + "learning_rate": 2.7378507871321012e-05, + "loss": 0.8164, + "step": 200 + }, + { + "epoch": 0.009039362314076753, + "grad_norm": 0.8161728978157043, + "learning_rate": 3.0116358658453113e-05, + "loss": 0.8124, + "step": 220 + }, + { + "epoch": 0.009861122524447366, + "grad_norm": 0.8150123953819275, + "learning_rate": 3.285420944558522e-05, + "loss": 0.8046, + "step": 240 + }, + { + "epoch": 0.01068288273481798, + "grad_norm": 0.7666240334510803, + "learning_rate": 3.559206023271732e-05, + "loss": 0.804, + "step": 260 + }, + { + "epoch": 0.011504642945188595, + "grad_norm": 0.7685117721557617, + "learning_rate": 3.832991101984942e-05, + "loss": 0.7959, + "step": 280 + }, + { + "epoch": 0.012326403155559208, + "grad_norm": 0.7646723985671997, + "learning_rate": 4.1067761806981516e-05, + "loss": 0.7919, + "step": 300 + }, + { + "epoch": 0.013148163365929822, + "grad_norm": 0.6962915062904358, + "learning_rate": 4.380561259411362e-05, + "loss": 0.8057, + "step": 320 + }, + { + "epoch": 0.013969923576300435, + "grad_norm": 0.7180787324905396, + "learning_rate": 4.654346338124572e-05, + "loss": 0.7921, + "step": 340 + }, + { + "epoch": 0.01479168378667105, + "grad_norm": 0.6863545179367065, + "learning_rate": 4.928131416837782e-05, + "loss": 0.7752, + "step": 360 + }, + { + "epoch": 0.015613443997041664, + "grad_norm": 0.7281647324562073, + "learning_rate": 5.201916495550992e-05, + "loss": 0.8258, + "step": 380 + }, + { + "epoch": 0.016435204207412277, + "grad_norm": 0.6535085439682007, + "learning_rate": 5.4757015742642024e-05, + "loss": 0.8181, + "step": 400 + }, + { + "epoch": 0.01725696441778289, + "grad_norm": 0.722362220287323, + "learning_rate": 5.7494866529774125e-05, + "loss": 0.7875, + "step": 420 + }, + { + "epoch": 0.018078724628153506, + "grad_norm": 0.6223776340484619, + "learning_rate": 6.023271731690623e-05, + "loss": 0.7995, + "step": 440 + }, + { + "epoch": 0.01890048483852412, + "grad_norm": 0.6497051119804382, + "learning_rate": 6.297056810403833e-05, + "loss": 0.8061, + "step": 460 + }, + { + "epoch": 0.01972224504889473, + "grad_norm": 0.6080052256584167, + "learning_rate": 6.570841889117044e-05, + "loss": 0.7808, + "step": 480 + }, + { + "epoch": 0.020544005259265346, + "grad_norm": 0.6562979221343994, + "learning_rate": 6.844626967830253e-05, + "loss": 0.8084, + "step": 500 + }, + { + "epoch": 0.02136576546963596, + "grad_norm": 0.5958985686302185, + "learning_rate": 7.118412046543464e-05, + "loss": 0.7635, + "step": 520 + }, + { + "epoch": 0.022187525680006575, + "grad_norm": 0.6067186594009399, + "learning_rate": 7.392197125256673e-05, + "loss": 0.8001, + "step": 540 + }, + { + "epoch": 0.02300928589037719, + "grad_norm": 0.5998467803001404, + "learning_rate": 7.665982203969884e-05, + "loss": 0.7907, + "step": 560 + }, + { + "epoch": 0.0238310461007478, + "grad_norm": 0.6743142604827881, + "learning_rate": 7.939767282683094e-05, + "loss": 0.7716, + "step": 580 + }, + { + "epoch": 0.024652806311118415, + "grad_norm": 0.5700744390487671, + "learning_rate": 8.213552361396303e-05, + "loss": 0.775, + "step": 600 + }, + { + "epoch": 0.02547456652148903, + "grad_norm": 0.5921510457992554, + "learning_rate": 8.487337440109514e-05, + "loss": 0.7874, + "step": 620 + }, + { + "epoch": 0.026296326731859644, + "grad_norm": 0.6034173965454102, + "learning_rate": 8.761122518822724e-05, + "loss": 0.7815, + "step": 640 + }, + { + "epoch": 0.02711808694223026, + "grad_norm": 0.5742852091789246, + "learning_rate": 9.034907597535934e-05, + "loss": 0.7881, + "step": 660 + }, + { + "epoch": 0.02793984715260087, + "grad_norm": 0.5726728439331055, + "learning_rate": 9.308692676249144e-05, + "loss": 0.7905, + "step": 680 + }, + { + "epoch": 0.028761607362971484, + "grad_norm": 0.5717937350273132, + "learning_rate": 9.582477754962355e-05, + "loss": 0.7674, + "step": 700 + }, + { + "epoch": 0.0295833675733421, + "grad_norm": 0.5768669247627258, + "learning_rate": 9.856262833675564e-05, + "loss": 0.7687, + "step": 720 + }, + { + "epoch": 0.030405127783712713, + "grad_norm": 0.577836275100708, + "learning_rate": 0.00010130047912388776, + "loss": 0.775, + "step": 740 + }, + { + "epoch": 0.031226887994083328, + "grad_norm": 0.5653334259986877, + "learning_rate": 0.00010403832991101984, + "loss": 0.7531, + "step": 760 + }, + { + "epoch": 0.03204864820445394, + "grad_norm": 0.6732868552207947, + "learning_rate": 0.00010677618069815197, + "loss": 0.7961, + "step": 780 + }, + { + "epoch": 0.03287040841482455, + "grad_norm": 0.5599177479743958, + "learning_rate": 0.00010951403148528405, + "loss": 0.7617, + "step": 800 + }, + { + "epoch": 0.03369216862519517, + "grad_norm": 0.6064387559890747, + "learning_rate": 0.00011225188227241617, + "loss": 0.7279, + "step": 820 + }, + { + "epoch": 0.03451392883556578, + "grad_norm": 0.5485804080963135, + "learning_rate": 0.00011498973305954825, + "loss": 0.7499, + "step": 840 + }, + { + "epoch": 0.0353356890459364, + "grad_norm": 0.5458228588104248, + "learning_rate": 0.00011772758384668037, + "loss": 0.7726, + "step": 860 + }, + { + "epoch": 0.03615744925630701, + "grad_norm": 0.5380986928939819, + "learning_rate": 0.00012046543463381245, + "loss": 0.8033, + "step": 880 + }, + { + "epoch": 0.036979209466677626, + "grad_norm": 0.6237996220588684, + "learning_rate": 0.00012320328542094456, + "loss": 0.7961, + "step": 900 + }, + { + "epoch": 0.03780096967704824, + "grad_norm": 0.561638593673706, + "learning_rate": 0.00012594113620807666, + "loss": 0.7799, + "step": 920 + }, + { + "epoch": 0.03862272988741885, + "grad_norm": 0.5861026644706726, + "learning_rate": 0.00012867898699520878, + "loss": 0.7801, + "step": 940 + }, + { + "epoch": 0.03944449009778946, + "grad_norm": 0.5767973065376282, + "learning_rate": 0.00013141683778234087, + "loss": 0.7743, + "step": 960 + }, + { + "epoch": 0.04026625030816008, + "grad_norm": 0.5575984120368958, + "learning_rate": 0.00013415468856947297, + "loss": 0.7781, + "step": 980 + }, + { + "epoch": 0.04108801051853069, + "grad_norm": 0.6257224678993225, + "learning_rate": 0.00013689253935660506, + "loss": 0.7942, + "step": 1000 + }, + { + "epoch": 0.041909770728901306, + "grad_norm": 0.5779247283935547, + "learning_rate": 0.00013963039014373718, + "loss": 0.7781, + "step": 1020 + }, + { + "epoch": 0.04273153093927192, + "grad_norm": 0.580823540687561, + "learning_rate": 0.00014236824093086928, + "loss": 0.7821, + "step": 1040 + }, + { + "epoch": 0.043553291149642535, + "grad_norm": 0.6481038331985474, + "learning_rate": 0.00014510609171800137, + "loss": 0.7743, + "step": 1060 + }, + { + "epoch": 0.04437505136001315, + "grad_norm": 0.5689835548400879, + "learning_rate": 0.00014784394250513347, + "loss": 0.7839, + "step": 1080 + }, + { + "epoch": 0.045196811570383764, + "grad_norm": 0.6186492443084717, + "learning_rate": 0.0001505817932922656, + "loss": 0.8148, + "step": 1100 + }, + { + "epoch": 0.04601857178075438, + "grad_norm": 0.6207014322280884, + "learning_rate": 0.00015331964407939769, + "loss": 0.7854, + "step": 1120 + }, + { + "epoch": 0.04684033199112499, + "grad_norm": 0.649277925491333, + "learning_rate": 0.00015605749486652978, + "loss": 0.792, + "step": 1140 + }, + { + "epoch": 0.0476620922014956, + "grad_norm": 0.6359161734580994, + "learning_rate": 0.00015879534565366188, + "loss": 0.7871, + "step": 1160 + }, + { + "epoch": 0.048483852411866216, + "grad_norm": 0.6189965009689331, + "learning_rate": 0.000161533196440794, + "loss": 0.8014, + "step": 1180 + }, + { + "epoch": 0.04930561262223683, + "grad_norm": 0.6509179472923279, + "learning_rate": 0.00016427104722792606, + "loss": 0.7993, + "step": 1200 + }, + { + "epoch": 0.050127372832607445, + "grad_norm": 0.70870041847229, + "learning_rate": 0.0001670088980150582, + "loss": 0.7892, + "step": 1220 + }, + { + "epoch": 0.05094913304297806, + "grad_norm": 0.6406404376029968, + "learning_rate": 0.00016974674880219028, + "loss": 0.7823, + "step": 1240 + }, + { + "epoch": 0.051770893253348674, + "grad_norm": 0.65333092212677, + "learning_rate": 0.0001724845995893224, + "loss": 0.788, + "step": 1260 + }, + { + "epoch": 0.05259265346371929, + "grad_norm": 0.6580297946929932, + "learning_rate": 0.00017522245037645447, + "loss": 0.7818, + "step": 1280 + }, + { + "epoch": 0.0534144136740899, + "grad_norm": 0.6168191432952881, + "learning_rate": 0.0001779603011635866, + "loss": 0.8192, + "step": 1300 + }, + { + "epoch": 0.05423617388446052, + "grad_norm": 0.6828853487968445, + "learning_rate": 0.0001806981519507187, + "loss": 0.7965, + "step": 1320 + }, + { + "epoch": 0.055057934094831125, + "grad_norm": 0.6340402960777283, + "learning_rate": 0.0001834360027378508, + "loss": 0.7994, + "step": 1340 + }, + { + "epoch": 0.05587969430520174, + "grad_norm": 0.6091774106025696, + "learning_rate": 0.00018617385352498288, + "loss": 0.8142, + "step": 1360 + }, + { + "epoch": 0.056701454515572354, + "grad_norm": 0.6250841617584229, + "learning_rate": 0.000188911704312115, + "loss": 0.7998, + "step": 1380 + }, + { + "epoch": 0.05752321472594297, + "grad_norm": 0.6069123148918152, + "learning_rate": 0.0001916495550992471, + "loss": 0.8138, + "step": 1400 + }, + { + "epoch": 0.05834497493631358, + "grad_norm": 0.6658041477203369, + "learning_rate": 0.00019438740588637922, + "loss": 0.8082, + "step": 1420 + }, + { + "epoch": 0.0591667351466842, + "grad_norm": 0.6972244381904602, + "learning_rate": 0.00019712525667351128, + "loss": 0.8203, + "step": 1440 + }, + { + "epoch": 0.05998849535705481, + "grad_norm": 0.6969318985939026, + "learning_rate": 0.0001998631074606434, + "loss": 0.7889, + "step": 1460 + }, + { + "epoch": 0.06081025556742543, + "grad_norm": 0.6873449087142944, + "learning_rate": 0.00019999992008709735, + "loss": 0.8483, + "step": 1480 + }, + { + "epoch": 0.06163201577779604, + "grad_norm": 0.6407928466796875, + "learning_rate": 0.0001999996633033991, + "loss": 0.8578, + "step": 1500 + }, + { + "epoch": 0.062453775988166656, + "grad_norm": 0.6195780634880066, + "learning_rate": 0.0001999992294279946, + "loss": 0.8053, + "step": 1520 + }, + { + "epoch": 0.06327553619853726, + "grad_norm": 0.7080503702163696, + "learning_rate": 0.00019999861846165223, + "loss": 0.7953, + "step": 1540 + }, + { + "epoch": 0.06409729640890788, + "grad_norm": 0.8221389055252075, + "learning_rate": 0.0001999978304054539, + "loss": 0.831, + "step": 1560 + }, + { + "epoch": 0.06491905661927849, + "grad_norm": 0.7362856268882751, + "learning_rate": 0.00019999686526079525, + "loss": 0.8333, + "step": 1580 + }, + { + "epoch": 0.0657408168296491, + "grad_norm": 0.7318360805511475, + "learning_rate": 0.0001999957230293855, + "loss": 0.8352, + "step": 1600 + }, + { + "epoch": 0.06656257704001972, + "grad_norm": 0.7396681904792786, + "learning_rate": 0.0001999944037132474, + "loss": 0.8108, + "step": 1620 + }, + { + "epoch": 0.06738433725039034, + "grad_norm": 0.624099850654602, + "learning_rate": 0.00019999290731471738, + "loss": 0.8049, + "step": 1640 + }, + { + "epoch": 0.06820609746076095, + "grad_norm": 0.8170691728591919, + "learning_rate": 0.00019999123383644544, + "loss": 0.8198, + "step": 1660 + }, + { + "epoch": 0.06902785767113157, + "grad_norm": 0.7529473304748535, + "learning_rate": 0.00019998938328139517, + "loss": 0.8276, + "step": 1680 + }, + { + "epoch": 0.06984961788150218, + "grad_norm": 0.6491063833236694, + "learning_rate": 0.0001999873556528438, + "loss": 0.8363, + "step": 1700 + }, + { + "epoch": 0.0706713780918728, + "grad_norm": 0.6839701533317566, + "learning_rate": 0.00019998515095438207, + "loss": 0.8183, + "step": 1720 + }, + { + "epoch": 0.07149313830224341, + "grad_norm": 0.7682334184646606, + "learning_rate": 0.00019998276918991437, + "loss": 0.8124, + "step": 1740 + }, + { + "epoch": 0.07231489851261402, + "grad_norm": 0.7527047395706177, + "learning_rate": 0.00019998021036365856, + "loss": 0.8009, + "step": 1760 + }, + { + "epoch": 0.07313665872298464, + "grad_norm": 0.7900727391242981, + "learning_rate": 0.00019997747448014615, + "loss": 0.8252, + "step": 1780 + }, + { + "epoch": 0.07395841893335525, + "grad_norm": 0.799541175365448, + "learning_rate": 0.0001999745615442222, + "loss": 0.8285, + "step": 1800 + }, + { + "epoch": 0.07478017914372587, + "grad_norm": 0.7471742033958435, + "learning_rate": 0.00019997147156104527, + "loss": 0.8078, + "step": 1820 + }, + { + "epoch": 0.07560193935409648, + "grad_norm": 0.8705668449401855, + "learning_rate": 0.00019996820453608752, + "loss": 0.847, + "step": 1840 + }, + { + "epoch": 0.07642369956446708, + "grad_norm": 0.6803821921348572, + "learning_rate": 0.00019996476047513454, + "loss": 0.8152, + "step": 1860 + }, + { + "epoch": 0.0772454597748377, + "grad_norm": 0.7618655562400818, + "learning_rate": 0.00019996113938428555, + "loss": 0.8178, + "step": 1880 + }, + { + "epoch": 0.07806721998520831, + "grad_norm": 0.7413930296897888, + "learning_rate": 0.0001999573412699532, + "loss": 0.8538, + "step": 1900 + }, + { + "epoch": 0.07888898019557893, + "grad_norm": 0.7185872197151184, + "learning_rate": 0.0001999533661388637, + "loss": 0.8205, + "step": 1920 + }, + { + "epoch": 0.07971074040594954, + "grad_norm": 0.7739173769950867, + "learning_rate": 0.0001999492139980566, + "loss": 0.8305, + "step": 1940 + }, + { + "epoch": 0.08053250061632015, + "grad_norm": 0.7355234026908875, + "learning_rate": 0.0001999448848548851, + "loss": 0.8434, + "step": 1960 + }, + { + "epoch": 0.08135426082669077, + "grad_norm": 0.6984680891036987, + "learning_rate": 0.00019994037871701577, + "loss": 0.8307, + "step": 1980 + }, + { + "epoch": 0.08217602103706138, + "grad_norm": 0.8437952995300293, + "learning_rate": 0.00019993569559242864, + "loss": 0.8156, + "step": 2000 + }, + { + "epoch": 0.08217602103706138, + "eval_loss": 1.0192168951034546, + "eval_runtime": 16.6004, + "eval_samples_per_second": 157.827, + "eval_steps_per_second": 4.94, + "step": 2000 + }, + { + "epoch": 0.082997781247432, + "grad_norm": 0.7162949442863464, + "learning_rate": 0.00019993132946350455, + "loss": 0.8269, + "step": 2020 + }, + { + "epoch": 0.08381954145780261, + "grad_norm": 0.8335065841674805, + "learning_rate": 0.00019992631008726108, + "loss": 0.8296, + "step": 2040 + }, + { + "epoch": 0.08464130166817323, + "grad_norm": 0.7615776658058167, + "learning_rate": 0.00019992111374921422, + "loss": 0.8325, + "step": 2060 + }, + { + "epoch": 0.08546306187854384, + "grad_norm": 0.8376593589782715, + "learning_rate": 0.00019991574045856637, + "loss": 0.8299, + "step": 2080 + }, + { + "epoch": 0.08628482208891446, + "grad_norm": 0.7417710423469543, + "learning_rate": 0.00019991019022483312, + "loss": 0.8419, + "step": 2100 + }, + { + "epoch": 0.08710658229928507, + "grad_norm": 0.781035840511322, + "learning_rate": 0.00019990446305784358, + "loss": 0.8179, + "step": 2120 + }, + { + "epoch": 0.08792834250965569, + "grad_norm": 0.7956770062446594, + "learning_rate": 0.0001998985589677401, + "loss": 0.8365, + "step": 2140 + }, + { + "epoch": 0.0887501027200263, + "grad_norm": 0.8096093535423279, + "learning_rate": 0.00019989247796497838, + "loss": 0.8122, + "step": 2160 + }, + { + "epoch": 0.08957186293039691, + "grad_norm": 0.9604154825210571, + "learning_rate": 0.00019988622006032736, + "loss": 0.8284, + "step": 2180 + }, + { + "epoch": 0.09039362314076753, + "grad_norm": 0.7302993535995483, + "learning_rate": 0.00019988011120562424, + "loss": 0.8275, + "step": 2200 + }, + { + "epoch": 0.09121538335113814, + "grad_norm": 0.7435317635536194, + "learning_rate": 0.00019987350837444987, + "loss": 0.8482, + "step": 2220 + }, + { + "epoch": 0.09203714356150876, + "grad_norm": 0.7641948461532593, + "learning_rate": 0.00019986672867497988, + "loss": 0.8277, + "step": 2240 + }, + { + "epoch": 0.09285890377187937, + "grad_norm": 0.8367336392402649, + "learning_rate": 0.00019985977211922068, + "loss": 0.8148, + "step": 2260 + }, + { + "epoch": 0.09368066398224997, + "grad_norm": 0.8000660538673401, + "learning_rate": 0.0001998526387194917, + "loss": 0.8525, + "step": 2280 + }, + { + "epoch": 0.09450242419262059, + "grad_norm": 0.7250078320503235, + "learning_rate": 0.0001998453284884257, + "loss": 0.8394, + "step": 2300 + }, + { + "epoch": 0.0953241844029912, + "grad_norm": 0.8181660771369934, + "learning_rate": 0.00019983784143896854, + "loss": 0.8396, + "step": 2320 + }, + { + "epoch": 0.09614594461336182, + "grad_norm": 0.8068580031394958, + "learning_rate": 0.00019983017758437916, + "loss": 0.8431, + "step": 2340 + }, + { + "epoch": 0.09696770482373243, + "grad_norm": 0.8218814134597778, + "learning_rate": 0.0001998223369382297, + "loss": 0.8335, + "step": 2360 + }, + { + "epoch": 0.09778946503410305, + "grad_norm": 0.7909825444221497, + "learning_rate": 0.00019981431951440537, + "loss": 0.8337, + "step": 2380 + }, + { + "epoch": 0.09861122524447366, + "grad_norm": 0.7333732843399048, + "learning_rate": 0.00019980612532710434, + "loss": 0.8423, + "step": 2400 + }, + { + "epoch": 0.09943298545484427, + "grad_norm": 0.8515979647636414, + "learning_rate": 0.00019979775439083795, + "loss": 0.8397, + "step": 2420 + }, + { + "epoch": 0.10025474566521489, + "grad_norm": 0.7907799482345581, + "learning_rate": 0.0001997892067204304, + "loss": 0.8332, + "step": 2440 + }, + { + "epoch": 0.1010765058755855, + "grad_norm": 0.862369179725647, + "learning_rate": 0.00019978048233101903, + "loss": 0.8295, + "step": 2460 + }, + { + "epoch": 0.10189826608595612, + "grad_norm": 0.733180046081543, + "learning_rate": 0.00019977158123805403, + "loss": 0.8182, + "step": 2480 + }, + { + "epoch": 0.10272002629632673, + "grad_norm": 0.8313851952552795, + "learning_rate": 0.00019976250345729856, + "loss": 0.8501, + "step": 2500 + }, + { + "epoch": 0.10354178650669735, + "grad_norm": 0.7268729209899902, + "learning_rate": 0.0001997532490048287, + "loss": 0.8188, + "step": 2520 + }, + { + "epoch": 0.10436354671706796, + "grad_norm": 0.8857830166816711, + "learning_rate": 0.0001997438178970333, + "loss": 0.8125, + "step": 2540 + }, + { + "epoch": 0.10518530692743858, + "grad_norm": 0.8165369033813477, + "learning_rate": 0.0001997342101506142, + "loss": 0.8283, + "step": 2560 + }, + { + "epoch": 0.10600706713780919, + "grad_norm": 0.8085136413574219, + "learning_rate": 0.00019972442578258597, + "loss": 0.8499, + "step": 2580 + }, + { + "epoch": 0.1068288273481798, + "grad_norm": 0.914569079875946, + "learning_rate": 0.00019971446481027591, + "loss": 0.8235, + "step": 2600 + }, + { + "epoch": 0.10765058755855042, + "grad_norm": 0.8516772389411926, + "learning_rate": 0.0001997043272513242, + "loss": 0.8541, + "step": 2620 + }, + { + "epoch": 0.10847234776892103, + "grad_norm": 0.8908547163009644, + "learning_rate": 0.0001996940131236836, + "loss": 0.8696, + "step": 2640 + }, + { + "epoch": 0.10929410797929165, + "grad_norm": 0.7930579781532288, + "learning_rate": 0.00019968352244561976, + "loss": 0.8295, + "step": 2660 + }, + { + "epoch": 0.11011586818966225, + "grad_norm": 0.8197824358940125, + "learning_rate": 0.00019967285523571075, + "loss": 0.8251, + "step": 2680 + }, + { + "epoch": 0.11093762840003286, + "grad_norm": 0.8889797925949097, + "learning_rate": 0.00019966201151284745, + "loss": 0.8482, + "step": 2700 + }, + { + "epoch": 0.11175938861040348, + "grad_norm": 0.8669871091842651, + "learning_rate": 0.0001996509912962332, + "loss": 0.848, + "step": 2720 + }, + { + "epoch": 0.1125811488207741, + "grad_norm": 0.8625435829162598, + "learning_rate": 0.00019963979460538398, + "loss": 0.853, + "step": 2740 + }, + { + "epoch": 0.11340290903114471, + "grad_norm": 0.9078089594841003, + "learning_rate": 0.00019962842146012828, + "loss": 0.8474, + "step": 2760 + }, + { + "epoch": 0.11422466924151532, + "grad_norm": 0.8258838057518005, + "learning_rate": 0.00019961687188060708, + "loss": 0.8446, + "step": 2780 + }, + { + "epoch": 0.11504642945188594, + "grad_norm": 0.8453534841537476, + "learning_rate": 0.00019960573637644915, + "loss": 0.8416, + "step": 2800 + }, + { + "epoch": 0.11586818966225655, + "grad_norm": 0.8726224899291992, + "learning_rate": 0.00019959384280922383, + "loss": 0.8436, + "step": 2820 + }, + { + "epoch": 0.11668994987262717, + "grad_norm": 0.7921317219734192, + "learning_rate": 0.00019958177286896915, + "loss": 0.8331, + "step": 2840 + }, + { + "epoch": 0.11751171008299778, + "grad_norm": 1.0417413711547852, + "learning_rate": 0.00019956952657706, + "loss": 0.8612, + "step": 2860 + }, + { + "epoch": 0.1183334702933684, + "grad_norm": 0.8408219218254089, + "learning_rate": 0.00019955710395518363, + "loss": 0.8326, + "step": 2880 + }, + { + "epoch": 0.11915523050373901, + "grad_norm": 0.9015172123908997, + "learning_rate": 0.00019954450502533954, + "loss": 0.831, + "step": 2900 + }, + { + "epoch": 0.11997699071410962, + "grad_norm": 0.8820521831512451, + "learning_rate": 0.00019953172980983949, + "loss": 0.8383, + "step": 2920 + }, + { + "epoch": 0.12079875092448024, + "grad_norm": 0.899238646030426, + "learning_rate": 0.00019951877833130737, + "loss": 0.8387, + "step": 2940 + }, + { + "epoch": 0.12162051113485085, + "grad_norm": 0.827013373374939, + "learning_rate": 0.00019950565061267929, + "loss": 0.8421, + "step": 2960 + }, + { + "epoch": 0.12244227134522147, + "grad_norm": 0.8747543692588806, + "learning_rate": 0.00019949234667720336, + "loss": 0.819, + "step": 2980 + }, + { + "epoch": 0.12326403155559208, + "grad_norm": 0.7559501528739929, + "learning_rate": 0.00019947886654843991, + "loss": 0.8384, + "step": 3000 + }, + { + "epoch": 0.1240857917659627, + "grad_norm": 0.8533095121383667, + "learning_rate": 0.00019946521025026117, + "loss": 0.8291, + "step": 3020 + }, + { + "epoch": 0.12490755197633331, + "grad_norm": 0.9044725894927979, + "learning_rate": 0.00019945137780685138, + "loss": 0.8311, + "step": 3040 + }, + { + "epoch": 0.1257293121867039, + "grad_norm": 1.075642466545105, + "learning_rate": 0.00019943736924270679, + "loss": 0.8177, + "step": 3060 + }, + { + "epoch": 0.12655107239707453, + "grad_norm": 0.8421041369438171, + "learning_rate": 0.0001994231845826354, + "loss": 0.8205, + "step": 3080 + }, + { + "epoch": 0.12737283260744514, + "grad_norm": 0.9026604890823364, + "learning_rate": 0.0001994088238517572, + "loss": 0.8431, + "step": 3100 + }, + { + "epoch": 0.12819459281781576, + "grad_norm": 0.882366955280304, + "learning_rate": 0.00019939428707550395, + "loss": 0.836, + "step": 3120 + }, + { + "epoch": 0.12901635302818637, + "grad_norm": 0.8641519546508789, + "learning_rate": 0.00019937957427961918, + "loss": 0.8237, + "step": 3140 + }, + { + "epoch": 0.12983811323855698, + "grad_norm": 0.9230402708053589, + "learning_rate": 0.00019936468549015804, + "loss": 0.847, + "step": 3160 + }, + { + "epoch": 0.1306598734489276, + "grad_norm": 0.8539056181907654, + "learning_rate": 0.0001993496207334875, + "loss": 0.8347, + "step": 3180 + }, + { + "epoch": 0.1314816336592982, + "grad_norm": 1.0330878496170044, + "learning_rate": 0.00019933438003628604, + "loss": 0.8183, + "step": 3200 + }, + { + "epoch": 0.13230339386966883, + "grad_norm": 0.9674934148788452, + "learning_rate": 0.0001993189634255438, + "loss": 0.8457, + "step": 3220 + }, + { + "epoch": 0.13312515408003944, + "grad_norm": 0.7981789708137512, + "learning_rate": 0.00019930337092856243, + "loss": 0.8404, + "step": 3240 + }, + { + "epoch": 0.13394691429041006, + "grad_norm": 0.9389087557792664, + "learning_rate": 0.00019928760257295494, + "loss": 0.8212, + "step": 3260 + }, + { + "epoch": 0.13476867450078067, + "grad_norm": 0.907522439956665, + "learning_rate": 0.00019927165838664598, + "loss": 0.829, + "step": 3280 + }, + { + "epoch": 0.1355904347111513, + "grad_norm": 0.7471171021461487, + "learning_rate": 0.00019925553839787147, + "loss": 0.8199, + "step": 3300 + }, + { + "epoch": 0.1364121949215219, + "grad_norm": 0.8877785205841064, + "learning_rate": 0.00019923924263517856, + "loss": 0.82, + "step": 3320 + }, + { + "epoch": 0.13723395513189252, + "grad_norm": 0.9058935046195984, + "learning_rate": 0.00019922277112742592, + "loss": 0.8416, + "step": 3340 + }, + { + "epoch": 0.13805571534226313, + "grad_norm": 1.0315223932266235, + "learning_rate": 0.0001992061239037833, + "loss": 0.8356, + "step": 3360 + }, + { + "epoch": 0.13887747555263374, + "grad_norm": 0.9553551077842712, + "learning_rate": 0.00019918930099373157, + "loss": 0.8114, + "step": 3380 + }, + { + "epoch": 0.13969923576300436, + "grad_norm": 0.8403427600860596, + "learning_rate": 0.00019917230242706287, + "loss": 0.8311, + "step": 3400 + }, + { + "epoch": 0.14052099597337497, + "grad_norm": 0.8313325643539429, + "learning_rate": 0.00019915512823388034, + "loss": 0.8412, + "step": 3420 + }, + { + "epoch": 0.1413427561837456, + "grad_norm": 0.9792637228965759, + "learning_rate": 0.00019913777844459814, + "loss": 0.8405, + "step": 3440 + }, + { + "epoch": 0.1421645163941162, + "grad_norm": 0.8566033244132996, + "learning_rate": 0.00019912025308994148, + "loss": 0.8007, + "step": 3460 + }, + { + "epoch": 0.14298627660448682, + "grad_norm": 0.83521568775177, + "learning_rate": 0.00019910255220094634, + "loss": 0.8301, + "step": 3480 + }, + { + "epoch": 0.14380803681485743, + "grad_norm": 0.9577664136886597, + "learning_rate": 0.00019908467580895964, + "loss": 0.8332, + "step": 3500 + }, + { + "epoch": 0.14462979702522805, + "grad_norm": 0.8675875663757324, + "learning_rate": 0.00019906662394563913, + "loss": 0.8434, + "step": 3520 + }, + { + "epoch": 0.14545155723559866, + "grad_norm": 0.9066608548164368, + "learning_rate": 0.00019904839664295322, + "loss": 0.8273, + "step": 3540 + }, + { + "epoch": 0.14627331744596928, + "grad_norm": 0.8817648887634277, + "learning_rate": 0.00019902999393318113, + "loss": 0.8201, + "step": 3560 + }, + { + "epoch": 0.1470950776563399, + "grad_norm": 0.931098997592926, + "learning_rate": 0.00019901141584891262, + "loss": 0.8631, + "step": 3580 + }, + { + "epoch": 0.1479168378667105, + "grad_norm": 0.9475566744804382, + "learning_rate": 0.00019899266242304797, + "loss": 0.8335, + "step": 3600 + }, + { + "epoch": 0.14873859807708112, + "grad_norm": 0.9767733812332153, + "learning_rate": 0.00019897373368879816, + "loss": 0.8283, + "step": 3620 + }, + { + "epoch": 0.14956035828745173, + "grad_norm": 0.9582000374794006, + "learning_rate": 0.00019895462967968444, + "loss": 0.857, + "step": 3640 + }, + { + "epoch": 0.15038211849782235, + "grad_norm": 1.07439124584198, + "learning_rate": 0.0001989353504295386, + "loss": 0.8316, + "step": 3660 + }, + { + "epoch": 0.15120387870819296, + "grad_norm": 0.7958013415336609, + "learning_rate": 0.00019891589597250265, + "loss": 0.8285, + "step": 3680 + }, + { + "epoch": 0.15202563891856358, + "grad_norm": 0.9222463965415955, + "learning_rate": 0.000198896266343029, + "loss": 0.8262, + "step": 3700 + }, + { + "epoch": 0.15284739912893416, + "grad_norm": 0.8631669282913208, + "learning_rate": 0.00019887646157588015, + "loss": 0.8399, + "step": 3720 + }, + { + "epoch": 0.15366915933930478, + "grad_norm": 0.9066493511199951, + "learning_rate": 0.0001988564817061289, + "loss": 0.8441, + "step": 3740 + }, + { + "epoch": 0.1544909195496754, + "grad_norm": 0.8802810907363892, + "learning_rate": 0.000198836326769158, + "loss": 0.8382, + "step": 3760 + }, + { + "epoch": 0.155312679760046, + "grad_norm": 0.8617781400680542, + "learning_rate": 0.00019881599680066024, + "loss": 0.8322, + "step": 3780 + }, + { + "epoch": 0.15613443997041662, + "grad_norm": 0.9121951460838318, + "learning_rate": 0.00019879549183663854, + "loss": 0.8103, + "step": 3800 + }, + { + "epoch": 0.15695620018078724, + "grad_norm": 0.8364813923835754, + "learning_rate": 0.0001987748119134056, + "loss": 0.8258, + "step": 3820 + }, + { + "epoch": 0.15777796039115785, + "grad_norm": 0.971182644367218, + "learning_rate": 0.00019875395706758388, + "loss": 0.8195, + "step": 3840 + }, + { + "epoch": 0.15859972060152847, + "grad_norm": 0.9081626534461975, + "learning_rate": 0.00019873292733610577, + "loss": 0.7991, + "step": 3860 + }, + { + "epoch": 0.15942148081189908, + "grad_norm": 1.0428310632705688, + "learning_rate": 0.00019871278713727932, + "loss": 0.8337, + "step": 3880 + }, + { + "epoch": 0.1602432410222697, + "grad_norm": 0.8817542791366577, + "learning_rate": 0.0001986914164861707, + "loss": 0.8422, + "step": 3900 + }, + { + "epoch": 0.1610650012326403, + "grad_norm": 0.9848262667655945, + "learning_rate": 0.00019867095248177687, + "loss": 0.8604, + "step": 3920 + }, + { + "epoch": 0.16188676144301092, + "grad_norm": 0.8702915906906128, + "learning_rate": 0.00019864924105494623, + "loss": 0.824, + "step": 3940 + }, + { + "epoch": 0.16270852165338154, + "grad_norm": 0.8769912123680115, + "learning_rate": 0.00019862735492790314, + "loss": 0.8125, + "step": 3960 + }, + { + "epoch": 0.16353028186375215, + "grad_norm": 0.9141522645950317, + "learning_rate": 0.00019860529413940633, + "loss": 0.8426, + "step": 3980 + }, + { + "epoch": 0.16435204207412277, + "grad_norm": 0.9294391870498657, + "learning_rate": 0.00019858305872852373, + "loss": 0.8605, + "step": 4000 + }, + { + "epoch": 0.16435204207412277, + "eval_loss": 1.0459537506103516, + "eval_runtime": 16.5325, + "eval_samples_per_second": 158.476, + "eval_steps_per_second": 4.96, + "step": 4000 + }, + { + "epoch": 0.16517380228449338, + "grad_norm": 0.8110593557357788, + "learning_rate": 0.00019856064873463252, + "loss": 0.839, + "step": 4020 + }, + { + "epoch": 0.165995562494864, + "grad_norm": 0.8444618582725525, + "learning_rate": 0.00019853806419741908, + "loss": 0.8355, + "step": 4040 + }, + { + "epoch": 0.1668173227052346, + "grad_norm": 1.0283797979354858, + "learning_rate": 0.00019851530515687892, + "loss": 0.8491, + "step": 4060 + }, + { + "epoch": 0.16763908291560523, + "grad_norm": 1.0117839574813843, + "learning_rate": 0.00019849237165331656, + "loss": 0.8466, + "step": 4080 + }, + { + "epoch": 0.16846084312597584, + "grad_norm": 0.8671867847442627, + "learning_rate": 0.0001984692637273454, + "loss": 0.8684, + "step": 4100 + }, + { + "epoch": 0.16928260333634645, + "grad_norm": 1.0693740844726562, + "learning_rate": 0.00019844598141988782, + "loss": 0.8419, + "step": 4120 + }, + { + "epoch": 0.17010436354671707, + "grad_norm": 1.088922381401062, + "learning_rate": 0.00019842252477217503, + "loss": 0.8169, + "step": 4140 + }, + { + "epoch": 0.17092612375708768, + "grad_norm": 0.8914449214935303, + "learning_rate": 0.00019839889382574692, + "loss": 0.8305, + "step": 4160 + }, + { + "epoch": 0.1717478839674583, + "grad_norm": 1.0136765241622925, + "learning_rate": 0.00019837508862245208, + "loss": 0.8481, + "step": 4180 + }, + { + "epoch": 0.1725696441778289, + "grad_norm": 0.9876135587692261, + "learning_rate": 0.00019835110920444772, + "loss": 0.8384, + "step": 4200 + }, + { + "epoch": 0.17339140438819953, + "grad_norm": 0.9133214354515076, + "learning_rate": 0.0001983269556141995, + "loss": 0.8679, + "step": 4220 + }, + { + "epoch": 0.17421316459857014, + "grad_norm": 0.9155489802360535, + "learning_rate": 0.0001983026278944816, + "loss": 0.8546, + "step": 4240 + }, + { + "epoch": 0.17503492480894076, + "grad_norm": 1.0384551286697388, + "learning_rate": 0.0001982781260883765, + "loss": 0.8076, + "step": 4260 + }, + { + "epoch": 0.17585668501931137, + "grad_norm": 0.9135899543762207, + "learning_rate": 0.00019825345023927505, + "loss": 0.8113, + "step": 4280 + }, + { + "epoch": 0.17667844522968199, + "grad_norm": 0.8842368125915527, + "learning_rate": 0.00019822860039087628, + "loss": 0.8315, + "step": 4300 + }, + { + "epoch": 0.1775002054400526, + "grad_norm": 0.9855481386184692, + "learning_rate": 0.00019820357658718738, + "loss": 0.8051, + "step": 4320 + }, + { + "epoch": 0.17832196565042321, + "grad_norm": 0.9533560276031494, + "learning_rate": 0.0001981783788725235, + "loss": 0.8215, + "step": 4340 + }, + { + "epoch": 0.17914372586079383, + "grad_norm": 0.9931275248527527, + "learning_rate": 0.00019815300729150793, + "loss": 0.8463, + "step": 4360 + }, + { + "epoch": 0.17996548607116444, + "grad_norm": 0.9402872323989868, + "learning_rate": 0.00019812746188907173, + "loss": 0.8228, + "step": 4380 + }, + { + "epoch": 0.18078724628153506, + "grad_norm": 0.8862301111221313, + "learning_rate": 0.0001981017427104539, + "loss": 0.8377, + "step": 4400 + }, + { + "epoch": 0.18160900649190567, + "grad_norm": 1.069446086883545, + "learning_rate": 0.0001980771485720597, + "loss": 0.8357, + "step": 4420 + }, + { + "epoch": 0.1824307667022763, + "grad_norm": 0.9147841334342957, + "learning_rate": 0.0001980510906611715, + "loss": 0.8329, + "step": 4440 + }, + { + "epoch": 0.1832525269126469, + "grad_norm": 0.9529426097869873, + "learning_rate": 0.0001980248591093492, + "loss": 0.8439, + "step": 4460 + }, + { + "epoch": 0.18407428712301752, + "grad_norm": 0.9662571549415588, + "learning_rate": 0.00019799845396304688, + "loss": 0.8325, + "step": 4480 + }, + { + "epoch": 0.18489604733338813, + "grad_norm": 0.9648681879043579, + "learning_rate": 0.000197971875269026, + "loss": 0.8324, + "step": 4500 + }, + { + "epoch": 0.18571780754375875, + "grad_norm": 1.0208882093429565, + "learning_rate": 0.00019794646480399925, + "loss": 0.8536, + "step": 4520 + }, + { + "epoch": 0.18653956775412933, + "grad_norm": 1.0145078897476196, + "learning_rate": 0.00019791954782758873, + "loss": 0.8317, + "step": 4540 + }, + { + "epoch": 0.18736132796449995, + "grad_norm": 1.0626345872879028, + "learning_rate": 0.00019789245744319638, + "loss": 0.8443, + "step": 4560 + }, + { + "epoch": 0.18818308817487056, + "grad_norm": 1.0466902256011963, + "learning_rate": 0.00019786519369879716, + "loss": 0.823, + "step": 4580 + }, + { + "epoch": 0.18900484838524118, + "grad_norm": 0.9382312893867493, + "learning_rate": 0.00019783775664267302, + "loss": 0.8326, + "step": 4600 + }, + { + "epoch": 0.1898266085956118, + "grad_norm": 0.9509291648864746, + "learning_rate": 0.00019781014632341292, + "loss": 0.8415, + "step": 4620 + }, + { + "epoch": 0.1906483688059824, + "grad_norm": 1.018986701965332, + "learning_rate": 0.0001977823627899126, + "loss": 0.8415, + "step": 4640 + }, + { + "epoch": 0.19147012901635302, + "grad_norm": 1.0662530660629272, + "learning_rate": 0.0001977544060913746, + "loss": 0.8453, + "step": 4660 + }, + { + "epoch": 0.19229188922672363, + "grad_norm": 1.0088191032409668, + "learning_rate": 0.000197726276277308, + "loss": 0.8611, + "step": 4680 + }, + { + "epoch": 0.19311364943709425, + "grad_norm": 0.9862759709358215, + "learning_rate": 0.0001976979733975286, + "loss": 0.8222, + "step": 4700 + }, + { + "epoch": 0.19393540964746486, + "grad_norm": 0.982672929763794, + "learning_rate": 0.0001976694975021586, + "loss": 0.8109, + "step": 4720 + }, + { + "epoch": 0.19475716985783548, + "grad_norm": 0.8332194685935974, + "learning_rate": 0.0001976408486416266, + "loss": 0.826, + "step": 4740 + }, + { + "epoch": 0.1955789300682061, + "grad_norm": 1.1170557737350464, + "learning_rate": 0.00019761202686666756, + "loss": 0.8748, + "step": 4760 + }, + { + "epoch": 0.1964006902785767, + "grad_norm": 0.8429856300354004, + "learning_rate": 0.0001975830322283226, + "loss": 0.8304, + "step": 4780 + }, + { + "epoch": 0.19722245048894732, + "grad_norm": 1.002530813217163, + "learning_rate": 0.000197553864777939, + "loss": 0.8542, + "step": 4800 + }, + { + "epoch": 0.19804421069931794, + "grad_norm": 1.2244071960449219, + "learning_rate": 0.0001975245245671701, + "loss": 0.8368, + "step": 4820 + }, + { + "epoch": 0.19886597090968855, + "grad_norm": 0.9923454523086548, + "learning_rate": 0.0001974950116479751, + "loss": 0.8236, + "step": 4840 + }, + { + "epoch": 0.19968773112005916, + "grad_norm": 0.9827476143836975, + "learning_rate": 0.00019746532607261915, + "loss": 0.8356, + "step": 4860 + }, + { + "epoch": 0.20050949133042978, + "grad_norm": 0.9938998222351074, + "learning_rate": 0.0001974354678936731, + "loss": 0.8383, + "step": 4880 + }, + { + "epoch": 0.2013312515408004, + "grad_norm": 0.9436901807785034, + "learning_rate": 0.00019740543716401346, + "loss": 0.8464, + "step": 4900 + }, + { + "epoch": 0.202153011751171, + "grad_norm": 0.8767272233963013, + "learning_rate": 0.0001973752339368224, + "loss": 0.849, + "step": 4920 + }, + { + "epoch": 0.20297477196154162, + "grad_norm": 0.9653998613357544, + "learning_rate": 0.00019734485826558747, + "loss": 0.8356, + "step": 4940 + }, + { + "epoch": 0.20379653217191224, + "grad_norm": 0.8907719850540161, + "learning_rate": 0.00019731431020410167, + "loss": 0.8323, + "step": 4960 + }, + { + "epoch": 0.20461829238228285, + "grad_norm": 1.1022579669952393, + "learning_rate": 0.00019728358980646325, + "loss": 0.8437, + "step": 4980 + }, + { + "epoch": 0.20544005259265347, + "grad_norm": 0.9100618958473206, + "learning_rate": 0.00019725269712707566, + "loss": 0.8502, + "step": 5000 + }, + { + "epoch": 0.20626181280302408, + "grad_norm": 0.9730123281478882, + "learning_rate": 0.0001972216322206475, + "loss": 0.8245, + "step": 5020 + }, + { + "epoch": 0.2070835730133947, + "grad_norm": 0.9601908922195435, + "learning_rate": 0.00019719039514219224, + "loss": 0.8326, + "step": 5040 + }, + { + "epoch": 0.2079053332237653, + "grad_norm": 1.0868589878082275, + "learning_rate": 0.00019715898594702843, + "loss": 0.8378, + "step": 5060 + }, + { + "epoch": 0.20872709343413592, + "grad_norm": 0.924371600151062, + "learning_rate": 0.0001971274046907793, + "loss": 0.832, + "step": 5080 + }, + { + "epoch": 0.20954885364450654, + "grad_norm": 1.1059744358062744, + "learning_rate": 0.00019709565142937287, + "loss": 0.8093, + "step": 5100 + }, + { + "epoch": 0.21037061385487715, + "grad_norm": 1.0641423463821411, + "learning_rate": 0.00019706372621904164, + "loss": 0.8173, + "step": 5120 + }, + { + "epoch": 0.21119237406524777, + "grad_norm": 1.1420958042144775, + "learning_rate": 0.00019703162911632275, + "loss": 0.8424, + "step": 5140 + }, + { + "epoch": 0.21201413427561838, + "grad_norm": 0.9624399542808533, + "learning_rate": 0.00019699936017805768, + "loss": 0.8347, + "step": 5160 + }, + { + "epoch": 0.212835894485989, + "grad_norm": 0.9680808186531067, + "learning_rate": 0.00019696691946139225, + "loss": 0.8207, + "step": 5180 + }, + { + "epoch": 0.2136576546963596, + "grad_norm": 0.9132868647575378, + "learning_rate": 0.00019693430702377647, + "loss": 0.8398, + "step": 5200 + }, + { + "epoch": 0.21447941490673023, + "grad_norm": 0.9321950674057007, + "learning_rate": 0.00019690152292296446, + "loss": 0.85, + "step": 5220 + }, + { + "epoch": 0.21530117511710084, + "grad_norm": 0.9252221584320068, + "learning_rate": 0.00019686856721701435, + "loss": 0.8251, + "step": 5240 + }, + { + "epoch": 0.21612293532747145, + "grad_norm": 0.9873983860015869, + "learning_rate": 0.00019683543996428811, + "loss": 0.8092, + "step": 5260 + }, + { + "epoch": 0.21694469553784207, + "grad_norm": 1.0303717851638794, + "learning_rate": 0.0001968021412234516, + "loss": 0.8396, + "step": 5280 + }, + { + "epoch": 0.21776645574821268, + "grad_norm": 0.9478332996368408, + "learning_rate": 0.00019676867105347431, + "loss": 0.8194, + "step": 5300 + }, + { + "epoch": 0.2185882159585833, + "grad_norm": 1.01088547706604, + "learning_rate": 0.00019673502951362935, + "loss": 0.8207, + "step": 5320 + }, + { + "epoch": 0.2194099761689539, + "grad_norm": 0.9483580589294434, + "learning_rate": 0.00019670121666349327, + "loss": 0.8452, + "step": 5340 + }, + { + "epoch": 0.2202317363793245, + "grad_norm": 1.0161420106887817, + "learning_rate": 0.00019666723256294604, + "loss": 0.8377, + "step": 5360 + }, + { + "epoch": 0.22105349658969511, + "grad_norm": 1.0933947563171387, + "learning_rate": 0.00019663307727217085, + "loss": 0.847, + "step": 5380 + }, + { + "epoch": 0.22187525680006573, + "grad_norm": 1.0978140830993652, + "learning_rate": 0.0001965987508516542, + "loss": 0.8423, + "step": 5400 + }, + { + "epoch": 0.22269701701043634, + "grad_norm": 0.9424787163734436, + "learning_rate": 0.00019656425336218544, + "loss": 0.8106, + "step": 5420 + }, + { + "epoch": 0.22351877722080696, + "grad_norm": 0.9634792804718018, + "learning_rate": 0.00019652958486485696, + "loss": 0.8387, + "step": 5440 + }, + { + "epoch": 0.22434053743117757, + "grad_norm": 1.0137280225753784, + "learning_rate": 0.0001964947454210641, + "loss": 0.8375, + "step": 5460 + }, + { + "epoch": 0.2251622976415482, + "grad_norm": 1.0315325260162354, + "learning_rate": 0.00019645973509250467, + "loss": 0.851, + "step": 5480 + }, + { + "epoch": 0.2259840578519188, + "grad_norm": 0.978634238243103, + "learning_rate": 0.00019642455394117944, + "loss": 0.7957, + "step": 5500 + }, + { + "epoch": 0.22680581806228942, + "grad_norm": 0.9586151838302612, + "learning_rate": 0.00019638920202939142, + "loss": 0.8423, + "step": 5520 + }, + { + "epoch": 0.22762757827266003, + "grad_norm": 0.9884860515594482, + "learning_rate": 0.00019635367941974615, + "loss": 0.85, + "step": 5540 + }, + { + "epoch": 0.22844933848303065, + "grad_norm": 0.884602963924408, + "learning_rate": 0.00019631798617515144, + "loss": 0.8204, + "step": 5560 + }, + { + "epoch": 0.22927109869340126, + "grad_norm": 0.9110316038131714, + "learning_rate": 0.0001962821223588173, + "loss": 0.8184, + "step": 5580 + }, + { + "epoch": 0.23009285890377187, + "grad_norm": 0.9724137783050537, + "learning_rate": 0.00019624608803425574, + "loss": 0.8198, + "step": 5600 + }, + { + "epoch": 0.2309146191141425, + "grad_norm": 1.004752278327942, + "learning_rate": 0.00019620988326528077, + "loss": 0.8502, + "step": 5620 + }, + { + "epoch": 0.2317363793245131, + "grad_norm": 1.0108088254928589, + "learning_rate": 0.00019617350811600831, + "loss": 0.8367, + "step": 5640 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 1.0130361318588257, + "learning_rate": 0.00019613696265085591, + "loss": 0.8151, + "step": 5660 + }, + { + "epoch": 0.23337989974525433, + "grad_norm": 1.0931516885757446, + "learning_rate": 0.0001961002469345428, + "loss": 0.8427, + "step": 5680 + }, + { + "epoch": 0.23420165995562495, + "grad_norm": 0.9896870851516724, + "learning_rate": 0.00019606336103208968, + "loss": 0.8365, + "step": 5700 + }, + { + "epoch": 0.23502342016599556, + "grad_norm": 1.0382894277572632, + "learning_rate": 0.0001960263050088186, + "loss": 0.8229, + "step": 5720 + }, + { + "epoch": 0.23584518037636618, + "grad_norm": 0.9921779036521912, + "learning_rate": 0.00019598907893035299, + "loss": 0.8462, + "step": 5740 + }, + { + "epoch": 0.2366669405867368, + "grad_norm": 1.02907395362854, + "learning_rate": 0.00019595168286261732, + "loss": 0.8369, + "step": 5760 + }, + { + "epoch": 0.2374887007971074, + "grad_norm": 0.962459921836853, + "learning_rate": 0.00019591411687183715, + "loss": 0.8207, + "step": 5780 + }, + { + "epoch": 0.23831046100747802, + "grad_norm": 1.0783615112304688, + "learning_rate": 0.000195876381024539, + "loss": 0.8238, + "step": 5800 + }, + { + "epoch": 0.23913222121784863, + "grad_norm": 1.0806901454925537, + "learning_rate": 0.00019583847538755014, + "loss": 0.8596, + "step": 5820 + }, + { + "epoch": 0.23995398142821925, + "grad_norm": 1.0261567831039429, + "learning_rate": 0.00019580040002799848, + "loss": 0.835, + "step": 5840 + }, + { + "epoch": 0.24077574163858986, + "grad_norm": 1.0381710529327393, + "learning_rate": 0.0001957621550133126, + "loss": 0.8614, + "step": 5860 + }, + { + "epoch": 0.24159750184896048, + "grad_norm": 1.0195953845977783, + "learning_rate": 0.00019572374041122148, + "loss": 0.8411, + "step": 5880 + }, + { + "epoch": 0.2424192620593311, + "grad_norm": 0.9467645287513733, + "learning_rate": 0.0001956851562897544, + "loss": 0.8375, + "step": 5900 + }, + { + "epoch": 0.2432410222697017, + "grad_norm": 0.9575105309486389, + "learning_rate": 0.0001956464027172409, + "loss": 0.8215, + "step": 5920 + }, + { + "epoch": 0.24406278248007232, + "grad_norm": 1.0752394199371338, + "learning_rate": 0.00019560747976231054, + "loss": 0.8377, + "step": 5940 + }, + { + "epoch": 0.24488454269044294, + "grad_norm": 0.9938384890556335, + "learning_rate": 0.0001955683874938929, + "loss": 0.8152, + "step": 5960 + }, + { + "epoch": 0.24570630290081355, + "grad_norm": 0.9279704689979553, + "learning_rate": 0.00019552912598121735, + "loss": 0.8061, + "step": 5980 + }, + { + "epoch": 0.24652806311118416, + "grad_norm": 0.9615955948829651, + "learning_rate": 0.00019548969529381306, + "loss": 0.8492, + "step": 6000 + }, + { + "epoch": 0.24652806311118416, + "eval_loss": 1.0622910261154175, + "eval_runtime": 16.4076, + "eval_samples_per_second": 159.682, + "eval_steps_per_second": 4.998, + "step": 6000 + }, + { + "epoch": 0.24734982332155478, + "grad_norm": 1.052895188331604, + "learning_rate": 0.0001954500955015087, + "loss": 0.8352, + "step": 6020 + }, + { + "epoch": 0.2481715835319254, + "grad_norm": 1.0382367372512817, + "learning_rate": 0.00019541032667443243, + "loss": 0.837, + "step": 6040 + }, + { + "epoch": 0.248993343742296, + "grad_norm": 1.0128381252288818, + "learning_rate": 0.00019537038888301183, + "loss": 0.8292, + "step": 6060 + }, + { + "epoch": 0.24981510395266662, + "grad_norm": 1.0595107078552246, + "learning_rate": 0.00019533028219797366, + "loss": 0.8431, + "step": 6080 + }, + { + "epoch": 0.2506368641630372, + "grad_norm": 1.0556915998458862, + "learning_rate": 0.00019529000669034376, + "loss": 0.8316, + "step": 6100 + }, + { + "epoch": 0.2514586243734078, + "grad_norm": 0.9882562160491943, + "learning_rate": 0.00019524956243144692, + "loss": 0.8377, + "step": 6120 + }, + { + "epoch": 0.25228038458377844, + "grad_norm": 1.0965570211410522, + "learning_rate": 0.00019520894949290684, + "loss": 0.8383, + "step": 6140 + }, + { + "epoch": 0.25310214479414905, + "grad_norm": 1.075129747390747, + "learning_rate": 0.0001951681679466459, + "loss": 0.8372, + "step": 6160 + }, + { + "epoch": 0.25392390500451967, + "grad_norm": 1.1094706058502197, + "learning_rate": 0.00019512721786488509, + "loss": 0.8321, + "step": 6180 + }, + { + "epoch": 0.2547456652148903, + "grad_norm": 0.9913383722305298, + "learning_rate": 0.00019508609932014382, + "loss": 0.8238, + "step": 6200 + }, + { + "epoch": 0.2555674254252609, + "grad_norm": 1.10612154006958, + "learning_rate": 0.0001950448123852399, + "loss": 0.8637, + "step": 6220 + }, + { + "epoch": 0.2563891856356315, + "grad_norm": 1.0326836109161377, + "learning_rate": 0.00019500335713328932, + "loss": 0.84, + "step": 6240 + }, + { + "epoch": 0.2572109458460021, + "grad_norm": 0.9649391174316406, + "learning_rate": 0.00019496173363770615, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.25803270605637274, + "grad_norm": 0.9617984890937805, + "learning_rate": 0.0001949199419722023, + "loss": 0.8537, + "step": 6280 + }, + { + "epoch": 0.25885446626674335, + "grad_norm": 1.1347591876983643, + "learning_rate": 0.0001948779822107877, + "loss": 0.8131, + "step": 6300 + }, + { + "epoch": 0.25967622647711397, + "grad_norm": 0.9121894240379333, + "learning_rate": 0.00019483585442776983, + "loss": 0.8407, + "step": 6320 + }, + { + "epoch": 0.2604979866874846, + "grad_norm": 1.203627586364746, + "learning_rate": 0.00019479355869775374, + "loss": 0.8455, + "step": 6340 + }, + { + "epoch": 0.2613197468978552, + "grad_norm": 1.0639876127243042, + "learning_rate": 0.00019475109509564192, + "loss": 0.8466, + "step": 6360 + }, + { + "epoch": 0.2621415071082258, + "grad_norm": 1.0065891742706299, + "learning_rate": 0.00019470846369663413, + "loss": 0.8502, + "step": 6380 + }, + { + "epoch": 0.2629632673185964, + "grad_norm": 1.0567289590835571, + "learning_rate": 0.00019466566457622734, + "loss": 0.8375, + "step": 6400 + }, + { + "epoch": 0.26378502752896704, + "grad_norm": 1.1206752061843872, + "learning_rate": 0.0001946226978102154, + "loss": 0.8333, + "step": 6420 + }, + { + "epoch": 0.26460678773933766, + "grad_norm": 1.0563714504241943, + "learning_rate": 0.00019457956347468925, + "loss": 0.836, + "step": 6440 + }, + { + "epoch": 0.26542854794970827, + "grad_norm": 1.051429033279419, + "learning_rate": 0.0001945362616460364, + "loss": 0.8287, + "step": 6460 + }, + { + "epoch": 0.2662503081600789, + "grad_norm": 0.9783703088760376, + "learning_rate": 0.0001944927924009411, + "loss": 0.8148, + "step": 6480 + }, + { + "epoch": 0.2670720683704495, + "grad_norm": 1.2782011032104492, + "learning_rate": 0.00019444915581638404, + "loss": 0.866, + "step": 6500 + }, + { + "epoch": 0.2678938285808201, + "grad_norm": 0.8880527019500732, + "learning_rate": 0.0001944053519696422, + "loss": 0.8282, + "step": 6520 + }, + { + "epoch": 0.26871558879119073, + "grad_norm": 1.0727986097335815, + "learning_rate": 0.0001943613809382889, + "loss": 0.8523, + "step": 6540 + }, + { + "epoch": 0.26953734900156134, + "grad_norm": 1.0758675336837769, + "learning_rate": 0.00019431724280019342, + "loss": 0.8381, + "step": 6560 + }, + { + "epoch": 0.27035910921193196, + "grad_norm": 1.10956609249115, + "learning_rate": 0.00019427293763352096, + "loss": 0.8159, + "step": 6580 + }, + { + "epoch": 0.2711808694223026, + "grad_norm": 1.1092921495437622, + "learning_rate": 0.00019422846551673262, + "loss": 0.8387, + "step": 6600 + }, + { + "epoch": 0.2720026296326732, + "grad_norm": 1.057029366493225, + "learning_rate": 0.00019418382652858506, + "loss": 0.8306, + "step": 6620 + }, + { + "epoch": 0.2728243898430438, + "grad_norm": 0.9744523763656616, + "learning_rate": 0.00019413902074813047, + "loss": 0.8582, + "step": 6640 + }, + { + "epoch": 0.2736461500534144, + "grad_norm": 0.9532150626182556, + "learning_rate": 0.00019409404825471654, + "loss": 0.8531, + "step": 6660 + }, + { + "epoch": 0.27446791026378503, + "grad_norm": 1.0654603242874146, + "learning_rate": 0.00019404890912798597, + "loss": 0.8437, + "step": 6680 + }, + { + "epoch": 0.27528967047415565, + "grad_norm": 1.0381238460540771, + "learning_rate": 0.00019400360344787676, + "loss": 0.8618, + "step": 6700 + }, + { + "epoch": 0.27611143068452626, + "grad_norm": 1.1071590185165405, + "learning_rate": 0.00019395813129462176, + "loss": 0.824, + "step": 6720 + }, + { + "epoch": 0.2769331908948969, + "grad_norm": 1.1044433116912842, + "learning_rate": 0.00019391249274874865, + "loss": 0.8332, + "step": 6740 + }, + { + "epoch": 0.2777549511052675, + "grad_norm": 1.145683765411377, + "learning_rate": 0.00019386668789107977, + "loss": 0.83, + "step": 6760 + }, + { + "epoch": 0.2785767113156381, + "grad_norm": 1.196481466293335, + "learning_rate": 0.00019382071680273198, + "loss": 0.8368, + "step": 6780 + }, + { + "epoch": 0.2793984715260087, + "grad_norm": 1.0642255544662476, + "learning_rate": 0.00019377457956511662, + "loss": 0.8439, + "step": 6800 + }, + { + "epoch": 0.28022023173637933, + "grad_norm": 1.0132989883422852, + "learning_rate": 0.0001937282762599391, + "loss": 0.8403, + "step": 6820 + }, + { + "epoch": 0.28104199194674995, + "grad_norm": 1.1021807193756104, + "learning_rate": 0.00019368180696919905, + "loss": 0.8373, + "step": 6840 + }, + { + "epoch": 0.28186375215712056, + "grad_norm": 1.0249390602111816, + "learning_rate": 0.00019363517177519004, + "loss": 0.8246, + "step": 6860 + }, + { + "epoch": 0.2826855123674912, + "grad_norm": 1.0310267210006714, + "learning_rate": 0.0001935883707604993, + "loss": 0.8266, + "step": 6880 + }, + { + "epoch": 0.2835072725778618, + "grad_norm": 1.1064010858535767, + "learning_rate": 0.00019354140400800797, + "loss": 0.8403, + "step": 6900 + }, + { + "epoch": 0.2843290327882324, + "grad_norm": 1.0507344007492065, + "learning_rate": 0.0001934942716008904, + "loss": 0.8365, + "step": 6920 + }, + { + "epoch": 0.285150792998603, + "grad_norm": 1.2774583101272583, + "learning_rate": 0.00019344697362261458, + "loss": 0.8394, + "step": 6940 + }, + { + "epoch": 0.28597255320897363, + "grad_norm": 1.1305222511291504, + "learning_rate": 0.0001933995101569415, + "loss": 0.8446, + "step": 6960 + }, + { + "epoch": 0.28679431341934425, + "grad_norm": 1.0519880056381226, + "learning_rate": 0.00019335188128792542, + "loss": 0.8589, + "step": 6980 + }, + { + "epoch": 0.28761607362971486, + "grad_norm": 0.9783779978752136, + "learning_rate": 0.00019330408709991326, + "loss": 0.8364, + "step": 7000 + }, + { + "epoch": 0.2884378338400855, + "grad_norm": 1.033211350440979, + "learning_rate": 0.0001932561276775449, + "loss": 0.8412, + "step": 7020 + }, + { + "epoch": 0.2892595940504561, + "grad_norm": 1.1831096410751343, + "learning_rate": 0.00019320800310575288, + "loss": 0.8495, + "step": 7040 + }, + { + "epoch": 0.2900813542608267, + "grad_norm": 1.0064650774002075, + "learning_rate": 0.00019315971346976193, + "loss": 0.8482, + "step": 7060 + }, + { + "epoch": 0.2909031144711973, + "grad_norm": 1.0976219177246094, + "learning_rate": 0.00019311125885508945, + "loss": 0.8173, + "step": 7080 + }, + { + "epoch": 0.29172487468156794, + "grad_norm": 1.110113263130188, + "learning_rate": 0.00019306263934754477, + "loss": 0.8285, + "step": 7100 + }, + { + "epoch": 0.29254663489193855, + "grad_norm": 1.0953800678253174, + "learning_rate": 0.0001930138550332292, + "loss": 0.8382, + "step": 7120 + }, + { + "epoch": 0.29336839510230917, + "grad_norm": 1.049208402633667, + "learning_rate": 0.0001929649059985362, + "loss": 0.8238, + "step": 7140 + }, + { + "epoch": 0.2941901553126798, + "grad_norm": 1.096807837486267, + "learning_rate": 0.0001929157923301506, + "loss": 0.8139, + "step": 7160 + }, + { + "epoch": 0.2950119155230504, + "grad_norm": 1.2268364429473877, + "learning_rate": 0.00019286651411504893, + "loss": 0.8349, + "step": 7180 + }, + { + "epoch": 0.295833675733421, + "grad_norm": 1.0595046281814575, + "learning_rate": 0.00019281707144049915, + "loss": 0.8448, + "step": 7200 + }, + { + "epoch": 0.2966554359437916, + "grad_norm": 1.0562009811401367, + "learning_rate": 0.00019276746439406047, + "loss": 0.834, + "step": 7220 + }, + { + "epoch": 0.29747719615416224, + "grad_norm": 1.0876846313476562, + "learning_rate": 0.0001927176930635831, + "loss": 0.8201, + "step": 7240 + }, + { + "epoch": 0.29829895636453285, + "grad_norm": 0.9775159955024719, + "learning_rate": 0.00019266775753720822, + "loss": 0.8612, + "step": 7260 + }, + { + "epoch": 0.29912071657490347, + "grad_norm": 1.1259452104568481, + "learning_rate": 0.00019261765790336784, + "loss": 0.8139, + "step": 7280 + }, + { + "epoch": 0.2999424767852741, + "grad_norm": 1.00784432888031, + "learning_rate": 0.00019256739425078454, + "loss": 0.8237, + "step": 7300 + }, + { + "epoch": 0.3007642369956447, + "grad_norm": 1.0394659042358398, + "learning_rate": 0.00019251696666847137, + "loss": 0.8692, + "step": 7320 + }, + { + "epoch": 0.3015859972060153, + "grad_norm": 1.1670759916305542, + "learning_rate": 0.00019246637524573173, + "loss": 0.8478, + "step": 7340 + }, + { + "epoch": 0.3024077574163859, + "grad_norm": 1.0558823347091675, + "learning_rate": 0.0001924156200721591, + "loss": 0.8413, + "step": 7360 + }, + { + "epoch": 0.30322951762675654, + "grad_norm": 1.0909830331802368, + "learning_rate": 0.000192364701237637, + "loss": 0.8584, + "step": 7380 + }, + { + "epoch": 0.30405127783712715, + "grad_norm": 0.9988498091697693, + "learning_rate": 0.00019231361883233878, + "loss": 0.8212, + "step": 7400 + }, + { + "epoch": 0.3048730380474977, + "grad_norm": 1.0689078569412231, + "learning_rate": 0.00019226237294672744, + "loss": 0.8464, + "step": 7420 + }, + { + "epoch": 0.3056947982578683, + "grad_norm": 1.0881212949752808, + "learning_rate": 0.00019221096367155548, + "loss": 0.8607, + "step": 7440 + }, + { + "epoch": 0.30651655846823894, + "grad_norm": 1.1320979595184326, + "learning_rate": 0.00019215939109786477, + "loss": 0.8359, + "step": 7460 + }, + { + "epoch": 0.30733831867860956, + "grad_norm": 0.8782603144645691, + "learning_rate": 0.0001921076553169864, + "loss": 0.8558, + "step": 7480 + }, + { + "epoch": 0.30816007888898017, + "grad_norm": 1.0550236701965332, + "learning_rate": 0.00019205575642054044, + "loss": 0.8088, + "step": 7500 + }, + { + "epoch": 0.3089818390993508, + "grad_norm": 1.1194961071014404, + "learning_rate": 0.0001920036945004358, + "loss": 0.8233, + "step": 7520 + }, + { + "epoch": 0.3098035993097214, + "grad_norm": 1.10885751247406, + "learning_rate": 0.00019195146964887024, + "loss": 0.8341, + "step": 7540 + }, + { + "epoch": 0.310625359520092, + "grad_norm": 1.015629768371582, + "learning_rate": 0.0001918990819583298, + "loss": 0.8547, + "step": 7560 + }, + { + "epoch": 0.31144711973046263, + "grad_norm": 1.142196536064148, + "learning_rate": 0.0001918465315215892, + "loss": 0.856, + "step": 7580 + }, + { + "epoch": 0.31226887994083324, + "grad_norm": 0.9691776037216187, + "learning_rate": 0.0001917938184317111, + "loss": 0.8295, + "step": 7600 + }, + { + "epoch": 0.31309064015120386, + "grad_norm": 0.9759687781333923, + "learning_rate": 0.00019174094278204636, + "loss": 0.8333, + "step": 7620 + }, + { + "epoch": 0.3139124003615745, + "grad_norm": 1.148779034614563, + "learning_rate": 0.00019168790466623375, + "loss": 0.8263, + "step": 7640 + }, + { + "epoch": 0.3147341605719451, + "grad_norm": 1.2947983741760254, + "learning_rate": 0.00019163470417819963, + "loss": 0.8627, + "step": 7660 + }, + { + "epoch": 0.3155559207823157, + "grad_norm": 1.0895724296569824, + "learning_rate": 0.00019158134141215792, + "loss": 0.8429, + "step": 7680 + }, + { + "epoch": 0.3163776809926863, + "grad_norm": 1.070154070854187, + "learning_rate": 0.0001915278164626101, + "loss": 0.8188, + "step": 7700 + }, + { + "epoch": 0.31719944120305693, + "grad_norm": 1.00252366065979, + "learning_rate": 0.00019147412942434463, + "loss": 0.8482, + "step": 7720 + }, + { + "epoch": 0.31802120141342755, + "grad_norm": 1.273224949836731, + "learning_rate": 0.00019142028039243717, + "loss": 0.8326, + "step": 7740 + }, + { + "epoch": 0.31884296162379816, + "grad_norm": 1.2733259201049805, + "learning_rate": 0.00019136626946225017, + "loss": 0.8377, + "step": 7760 + }, + { + "epoch": 0.3196647218341688, + "grad_norm": 0.9789584279060364, + "learning_rate": 0.00019131209672943288, + "loss": 0.8355, + "step": 7780 + }, + { + "epoch": 0.3204864820445394, + "grad_norm": 1.1093429327011108, + "learning_rate": 0.00019125776228992103, + "loss": 0.836, + "step": 7800 + }, + { + "epoch": 0.32130824225491, + "grad_norm": 1.0080488920211792, + "learning_rate": 0.00019120326623993668, + "loss": 0.8375, + "step": 7820 + }, + { + "epoch": 0.3221300024652806, + "grad_norm": 1.0703438520431519, + "learning_rate": 0.0001911486086759882, + "loss": 0.8306, + "step": 7840 + }, + { + "epoch": 0.32295176267565123, + "grad_norm": 1.0767182111740112, + "learning_rate": 0.0001910937896948699, + "loss": 0.8483, + "step": 7860 + }, + { + "epoch": 0.32377352288602185, + "grad_norm": 0.9747923612594604, + "learning_rate": 0.00019103880939366197, + "loss": 0.8489, + "step": 7880 + }, + { + "epoch": 0.32459528309639246, + "grad_norm": 1.0351313352584839, + "learning_rate": 0.00019098366786973032, + "loss": 0.8351, + "step": 7900 + }, + { + "epoch": 0.3254170433067631, + "grad_norm": 1.049666404724121, + "learning_rate": 0.00019092836522072631, + "loss": 0.8271, + "step": 7920 + }, + { + "epoch": 0.3262388035171337, + "grad_norm": 1.0181846618652344, + "learning_rate": 0.0001908729015445867, + "loss": 0.8352, + "step": 7940 + }, + { + "epoch": 0.3270605637275043, + "grad_norm": 1.1390068531036377, + "learning_rate": 0.00019081727693953337, + "loss": 0.8392, + "step": 7960 + }, + { + "epoch": 0.3278823239378749, + "grad_norm": 1.0242650508880615, + "learning_rate": 0.00019076149150407324, + "loss": 0.8398, + "step": 7980 + }, + { + "epoch": 0.32870408414824553, + "grad_norm": 1.0052822828292847, + "learning_rate": 0.0001907083464611993, + "loss": 0.8257, + "step": 8000 + }, + { + "epoch": 0.32870408414824553, + "eval_loss": 1.0951544046401978, + "eval_runtime": 16.4946, + "eval_samples_per_second": 158.84, + "eval_steps_per_second": 4.971, + "step": 8000 + }, + { + "epoch": 0.32952584435861615, + "grad_norm": 1.0948665142059326, + "learning_rate": 0.00019065224769085476, + "loss": 0.8343, + "step": 8020 + }, + { + "epoch": 0.33034760456898676, + "grad_norm": 1.1585348844528198, + "learning_rate": 0.00019059598838235754, + "loss": 0.8272, + "step": 8040 + }, + { + "epoch": 0.3311693647793574, + "grad_norm": 1.0641188621520996, + "learning_rate": 0.00019053956863533854, + "loss": 0.8499, + "step": 8060 + }, + { + "epoch": 0.331991124989728, + "grad_norm": 1.0595240592956543, + "learning_rate": 0.00019048298854971272, + "loss": 0.8407, + "step": 8080 + }, + { + "epoch": 0.3328128852000986, + "grad_norm": 1.1425433158874512, + "learning_rate": 0.00019042624822567908, + "loss": 0.8671, + "step": 8100 + }, + { + "epoch": 0.3336346454104692, + "grad_norm": 1.1736706495285034, + "learning_rate": 0.0001903693477637204, + "loss": 0.8115, + "step": 8120 + }, + { + "epoch": 0.33445640562083984, + "grad_norm": 1.062788724899292, + "learning_rate": 0.000190312287264603, + "loss": 0.8416, + "step": 8140 + }, + { + "epoch": 0.33527816583121045, + "grad_norm": 1.0873854160308838, + "learning_rate": 0.0001902550668293766, + "loss": 0.8513, + "step": 8160 + }, + { + "epoch": 0.33609992604158107, + "grad_norm": 1.0588126182556152, + "learning_rate": 0.00019019768655937423, + "loss": 0.8232, + "step": 8180 + }, + { + "epoch": 0.3369216862519517, + "grad_norm": 1.0401087999343872, + "learning_rate": 0.00019014014655621193, + "loss": 0.8369, + "step": 8200 + }, + { + "epoch": 0.3377434464623223, + "grad_norm": 0.988150417804718, + "learning_rate": 0.0001900824469217886, + "loss": 0.8076, + "step": 8220 + }, + { + "epoch": 0.3385652066726929, + "grad_norm": 1.2257081270217896, + "learning_rate": 0.00019002458775828584, + "loss": 0.824, + "step": 8240 + }, + { + "epoch": 0.3393869668830635, + "grad_norm": 0.9933615922927856, + "learning_rate": 0.0001899665691681678, + "loss": 0.8391, + "step": 8260 + }, + { + "epoch": 0.34020872709343414, + "grad_norm": 1.1779851913452148, + "learning_rate": 0.0001899083912541809, + "loss": 0.8333, + "step": 8280 + }, + { + "epoch": 0.34103048730380475, + "grad_norm": 1.118120551109314, + "learning_rate": 0.0001898500541193538, + "loss": 0.8333, + "step": 8300 + }, + { + "epoch": 0.34185224751417537, + "grad_norm": 1.0865180492401123, + "learning_rate": 0.00018979155786699706, + "loss": 0.8497, + "step": 8320 + }, + { + "epoch": 0.342674007724546, + "grad_norm": 0.9836400747299194, + "learning_rate": 0.000189732902600703, + "loss": 0.8547, + "step": 8340 + }, + { + "epoch": 0.3434957679349166, + "grad_norm": 1.1521192789077759, + "learning_rate": 0.00018967408842434562, + "loss": 0.8476, + "step": 8360 + }, + { + "epoch": 0.3443175281452872, + "grad_norm": 1.1589045524597168, + "learning_rate": 0.0001896151154420803, + "loss": 0.8479, + "step": 8380 + }, + { + "epoch": 0.3451392883556578, + "grad_norm": 1.0231435298919678, + "learning_rate": 0.00018955598375834364, + "loss": 0.863, + "step": 8400 + }, + { + "epoch": 0.34596104856602844, + "grad_norm": 1.0295898914337158, + "learning_rate": 0.00018949669347785328, + "loss": 0.8224, + "step": 8420 + }, + { + "epoch": 0.34678280877639905, + "grad_norm": 1.1238269805908203, + "learning_rate": 0.00018943724470560778, + "loss": 0.8276, + "step": 8440 + }, + { + "epoch": 0.34760456898676967, + "grad_norm": 1.0870115756988525, + "learning_rate": 0.00018937763754688634, + "loss": 0.8372, + "step": 8460 + }, + { + "epoch": 0.3484263291971403, + "grad_norm": 1.1568728685379028, + "learning_rate": 0.0001893178721072486, + "loss": 0.862, + "step": 8480 + }, + { + "epoch": 0.3492480894075109, + "grad_norm": 1.0375559329986572, + "learning_rate": 0.00018925794849253462, + "loss": 0.843, + "step": 8500 + }, + { + "epoch": 0.3500698496178815, + "grad_norm": 1.1177926063537598, + "learning_rate": 0.00018919786680886443, + "loss": 0.8303, + "step": 8520 + }, + { + "epoch": 0.3508916098282521, + "grad_norm": 1.1874128580093384, + "learning_rate": 0.00018913762716263818, + "loss": 0.8429, + "step": 8540 + }, + { + "epoch": 0.35171337003862274, + "grad_norm": 1.2707151174545288, + "learning_rate": 0.00018907722966053555, + "loss": 0.8257, + "step": 8560 + }, + { + "epoch": 0.35253513024899336, + "grad_norm": 1.1079628467559814, + "learning_rate": 0.00018901667440951586, + "loss": 0.849, + "step": 8580 + }, + { + "epoch": 0.35335689045936397, + "grad_norm": 1.1875925064086914, + "learning_rate": 0.0001889559615168179, + "loss": 0.8704, + "step": 8600 + }, + { + "epoch": 0.3541786506697346, + "grad_norm": 1.1461087465286255, + "learning_rate": 0.00018889509108995943, + "loss": 0.8292, + "step": 8620 + }, + { + "epoch": 0.3550004108801052, + "grad_norm": 1.3481261730194092, + "learning_rate": 0.0001888340632367373, + "loss": 0.8163, + "step": 8640 + }, + { + "epoch": 0.3558221710904758, + "grad_norm": 1.1863452196121216, + "learning_rate": 0.00018877287806522722, + "loss": 0.8532, + "step": 8660 + }, + { + "epoch": 0.35664393130084643, + "grad_norm": 1.3977798223495483, + "learning_rate": 0.00018871153568378332, + "loss": 0.8714, + "step": 8680 + }, + { + "epoch": 0.35746569151121704, + "grad_norm": 1.1754332780838013, + "learning_rate": 0.0001886500362010383, + "loss": 0.8243, + "step": 8700 + }, + { + "epoch": 0.35828745172158766, + "grad_norm": 1.1255104541778564, + "learning_rate": 0.000188588379725903, + "loss": 0.8025, + "step": 8720 + }, + { + "epoch": 0.35910921193195827, + "grad_norm": 1.0885831117630005, + "learning_rate": 0.00018852656636756627, + "loss": 0.8179, + "step": 8740 + }, + { + "epoch": 0.3599309721423289, + "grad_norm": 1.121172308921814, + "learning_rate": 0.00018846459623549482, + "loss": 0.831, + "step": 8760 + }, + { + "epoch": 0.3607527323526995, + "grad_norm": 1.2006275653839111, + "learning_rate": 0.000188402469439433, + "loss": 0.8451, + "step": 8780 + }, + { + "epoch": 0.3615744925630701, + "grad_norm": 1.0075160264968872, + "learning_rate": 0.00018834018608940257, + "loss": 0.8326, + "step": 8800 + }, + { + "epoch": 0.36239625277344073, + "grad_norm": 1.3210777044296265, + "learning_rate": 0.00018827774629570252, + "loss": 0.8466, + "step": 8820 + }, + { + "epoch": 0.36321801298381134, + "grad_norm": 1.157143473625183, + "learning_rate": 0.00018821515016890895, + "loss": 0.84, + "step": 8840 + }, + { + "epoch": 0.36403977319418196, + "grad_norm": 1.0349316596984863, + "learning_rate": 0.0001881523978198748, + "loss": 0.8166, + "step": 8860 + }, + { + "epoch": 0.3648615334045526, + "grad_norm": 1.1739977598190308, + "learning_rate": 0.00018808948935972964, + "loss": 0.8154, + "step": 8880 + }, + { + "epoch": 0.3656832936149232, + "grad_norm": 1.0839564800262451, + "learning_rate": 0.00018802642489987946, + "loss": 0.8446, + "step": 8900 + }, + { + "epoch": 0.3665050538252938, + "grad_norm": 1.126232624053955, + "learning_rate": 0.0001879632045520066, + "loss": 0.8262, + "step": 8920 + }, + { + "epoch": 0.3673268140356644, + "grad_norm": 1.1430919170379639, + "learning_rate": 0.00018789982842806947, + "loss": 0.8449, + "step": 8940 + }, + { + "epoch": 0.36814857424603503, + "grad_norm": 1.1284793615341187, + "learning_rate": 0.00018783629664030226, + "loss": 0.8482, + "step": 8960 + }, + { + "epoch": 0.36897033445640565, + "grad_norm": 1.0018378496170044, + "learning_rate": 0.00018777260930121487, + "loss": 0.8353, + "step": 8980 + }, + { + "epoch": 0.36979209466677626, + "grad_norm": 1.055388331413269, + "learning_rate": 0.00018771515779286891, + "loss": 0.8397, + "step": 9000 + }, + { + "epoch": 0.3706138548771469, + "grad_norm": 1.152448296546936, + "learning_rate": 0.00018765117521722443, + "loss": 0.8607, + "step": 9020 + }, + { + "epoch": 0.3714356150875175, + "grad_norm": 1.1177656650543213, + "learning_rate": 0.00018758703741809558, + "loss": 0.8254, + "step": 9040 + }, + { + "epoch": 0.37225737529788805, + "grad_norm": 1.336777687072754, + "learning_rate": 0.00018752274450906545, + "loss": 0.8367, + "step": 9060 + }, + { + "epoch": 0.37307913550825866, + "grad_norm": 1.19560706615448, + "learning_rate": 0.00018745829660399185, + "loss": 0.8528, + "step": 9080 + }, + { + "epoch": 0.3739008957186293, + "grad_norm": 1.2169603109359741, + "learning_rate": 0.00018739369381700707, + "loss": 0.8425, + "step": 9100 + }, + { + "epoch": 0.3747226559289999, + "grad_norm": 1.1716234683990479, + "learning_rate": 0.00018732893626251766, + "loss": 0.8358, + "step": 9120 + }, + { + "epoch": 0.3755444161393705, + "grad_norm": 1.0265463590621948, + "learning_rate": 0.00018726402405520425, + "loss": 0.8447, + "step": 9140 + }, + { + "epoch": 0.3763661763497411, + "grad_norm": 1.2216025590896606, + "learning_rate": 0.00018719895731002137, + "loss": 0.8247, + "step": 9160 + }, + { + "epoch": 0.37718793656011174, + "grad_norm": 1.0820845365524292, + "learning_rate": 0.0001871337361421972, + "loss": 0.8452, + "step": 9180 + }, + { + "epoch": 0.37800969677048235, + "grad_norm": 1.019952416419983, + "learning_rate": 0.00018706836066723347, + "loss": 0.8493, + "step": 9200 + }, + { + "epoch": 0.37883145698085297, + "grad_norm": 1.0316121578216553, + "learning_rate": 0.00018700283100090502, + "loss": 0.8447, + "step": 9220 + }, + { + "epoch": 0.3796532171912236, + "grad_norm": 1.1700369119644165, + "learning_rate": 0.00018693714725925994, + "loss": 0.8337, + "step": 9240 + }, + { + "epoch": 0.3804749774015942, + "grad_norm": 1.032667636871338, + "learning_rate": 0.00018687130955861902, + "loss": 0.8325, + "step": 9260 + }, + { + "epoch": 0.3812967376119648, + "grad_norm": 1.2093219757080078, + "learning_rate": 0.0001868053180155758, + "loss": 0.8295, + "step": 9280 + }, + { + "epoch": 0.3821184978223354, + "grad_norm": 1.1522185802459717, + "learning_rate": 0.00018673917274699618, + "loss": 0.841, + "step": 9300 + }, + { + "epoch": 0.38294025803270604, + "grad_norm": 1.2028223276138306, + "learning_rate": 0.00018667287387001834, + "loss": 0.8432, + "step": 9320 + }, + { + "epoch": 0.38376201824307665, + "grad_norm": 1.2288753986358643, + "learning_rate": 0.00018660642150205255, + "loss": 0.8586, + "step": 9340 + }, + { + "epoch": 0.38458377845344727, + "grad_norm": 1.1289194822311401, + "learning_rate": 0.00018653981576078075, + "loss": 0.8408, + "step": 9360 + }, + { + "epoch": 0.3854055386638179, + "grad_norm": 1.208264708518982, + "learning_rate": 0.00018647305676415665, + "loss": 0.8233, + "step": 9380 + }, + { + "epoch": 0.3862272988741885, + "grad_norm": 1.23066246509552, + "learning_rate": 0.00018640949387227146, + "loss": 0.8227, + "step": 9400 + }, + { + "epoch": 0.3870490590845591, + "grad_norm": 1.2344571352005005, + "learning_rate": 0.0001863424363680021, + "loss": 0.8376, + "step": 9420 + }, + { + "epoch": 0.3878708192949297, + "grad_norm": 1.1864609718322754, + "learning_rate": 0.00018627522595792413, + "loss": 0.8391, + "step": 9440 + }, + { + "epoch": 0.38869257950530034, + "grad_norm": 1.109244465827942, + "learning_rate": 0.00018620786276106203, + "loss": 0.8256, + "step": 9460 + }, + { + "epoch": 0.38951433971567095, + "grad_norm": 1.0809723138809204, + "learning_rate": 0.00018614034689671082, + "loss": 0.8492, + "step": 9480 + }, + { + "epoch": 0.39033609992604157, + "grad_norm": 1.2196381092071533, + "learning_rate": 0.00018607267848443591, + "loss": 0.8415, + "step": 9500 + }, + { + "epoch": 0.3911578601364122, + "grad_norm": 1.1039822101593018, + "learning_rate": 0.00018600485764407282, + "loss": 0.8464, + "step": 9520 + }, + { + "epoch": 0.3919796203467828, + "grad_norm": 1.115871548652649, + "learning_rate": 0.00018593688449572703, + "loss": 0.8368, + "step": 9540 + }, + { + "epoch": 0.3928013805571534, + "grad_norm": 1.0675318241119385, + "learning_rate": 0.0001858687591597738, + "loss": 0.8276, + "step": 9560 + }, + { + "epoch": 0.393623140767524, + "grad_norm": 1.1515909433364868, + "learning_rate": 0.00018580048175685784, + "loss": 0.8259, + "step": 9580 + }, + { + "epoch": 0.39444490097789464, + "grad_norm": 1.1502107381820679, + "learning_rate": 0.00018573205240789316, + "loss": 0.8549, + "step": 9600 + }, + { + "epoch": 0.39526666118826526, + "grad_norm": 1.0891849994659424, + "learning_rate": 0.00018566347123406284, + "loss": 0.8396, + "step": 9620 + }, + { + "epoch": 0.39608842139863587, + "grad_norm": 1.1536388397216797, + "learning_rate": 0.00018559473835681896, + "loss": 0.8401, + "step": 9640 + }, + { + "epoch": 0.3969101816090065, + "grad_norm": 1.320541262626648, + "learning_rate": 0.00018552585389788203, + "loss": 0.8454, + "step": 9660 + }, + { + "epoch": 0.3977319418193771, + "grad_norm": 0.96424800157547, + "learning_rate": 0.00018545681797924125, + "loss": 0.8257, + "step": 9680 + }, + { + "epoch": 0.3985537020297477, + "grad_norm": 1.1451895236968994, + "learning_rate": 0.00018538763072315382, + "loss": 0.8327, + "step": 9700 + }, + { + "epoch": 0.39937546224011833, + "grad_norm": 1.1693811416625977, + "learning_rate": 0.00018531829225214508, + "loss": 0.8342, + "step": 9720 + }, + { + "epoch": 0.40019722245048894, + "grad_norm": 1.0285801887512207, + "learning_rate": 0.00018524880268900812, + "loss": 0.8261, + "step": 9740 + }, + { + "epoch": 0.40101898266085956, + "grad_norm": 1.1713870763778687, + "learning_rate": 0.00018517916215680363, + "loss": 0.8597, + "step": 9760 + }, + { + "epoch": 0.40184074287123017, + "grad_norm": 1.117725133895874, + "learning_rate": 0.00018510937077885958, + "loss": 0.8281, + "step": 9780 + }, + { + "epoch": 0.4026625030816008, + "grad_norm": 1.2378820180892944, + "learning_rate": 0.00018503942867877118, + "loss": 0.8619, + "step": 9800 + }, + { + "epoch": 0.4034842632919714, + "grad_norm": 1.2209067344665527, + "learning_rate": 0.00018496933598040048, + "loss": 0.8549, + "step": 9820 + }, + { + "epoch": 0.404306023502342, + "grad_norm": 1.0837441682815552, + "learning_rate": 0.00018489909280787627, + "loss": 0.8575, + "step": 9840 + }, + { + "epoch": 0.40512778371271263, + "grad_norm": 1.2987329959869385, + "learning_rate": 0.00018482869928559379, + "loss": 0.8467, + "step": 9860 + }, + { + "epoch": 0.40594954392308324, + "grad_norm": 1.216752052307129, + "learning_rate": 0.00018475815553821456, + "loss": 0.8333, + "step": 9880 + }, + { + "epoch": 0.40677130413345386, + "grad_norm": 1.2210928201675415, + "learning_rate": 0.0001846874616906661, + "loss": 0.8377, + "step": 9900 + }, + { + "epoch": 0.4075930643438245, + "grad_norm": 1.007938027381897, + "learning_rate": 0.0001846166178681418, + "loss": 0.8422, + "step": 9920 + }, + { + "epoch": 0.4084148245541951, + "grad_norm": 1.1827200651168823, + "learning_rate": 0.00018454562419610058, + "loss": 0.8456, + "step": 9940 + }, + { + "epoch": 0.4092365847645657, + "grad_norm": 1.2097376585006714, + "learning_rate": 0.00018447448080026682, + "loss": 0.8408, + "step": 9960 + }, + { + "epoch": 0.4100583449749363, + "grad_norm": 1.0128288269042969, + "learning_rate": 0.00018440318780662998, + "loss": 0.8517, + "step": 9980 + }, + { + "epoch": 0.41088010518530693, + "grad_norm": 1.1385818719863892, + "learning_rate": 0.00018433174534144444, + "loss": 0.8532, + "step": 10000 + }, + { + "epoch": 0.41088010518530693, + "eval_loss": 1.1022228002548218, + "eval_runtime": 16.4762, + "eval_samples_per_second": 159.018, + "eval_steps_per_second": 4.977, + "step": 10000 + }, + { + "epoch": 0.41170186539567755, + "grad_norm": 1.2568473815917969, + "learning_rate": 0.00018426015353122934, + "loss": 0.8304, + "step": 10020 + }, + { + "epoch": 0.41252362560604816, + "grad_norm": 1.2366386651992798, + "learning_rate": 0.00018418841250276825, + "loss": 0.8244, + "step": 10040 + }, + { + "epoch": 0.4133453858164188, + "grad_norm": 1.0323954820632935, + "learning_rate": 0.000184116522383109, + "loss": 0.8167, + "step": 10060 + }, + { + "epoch": 0.4141671460267894, + "grad_norm": 1.1676981449127197, + "learning_rate": 0.00018404448329956344, + "loss": 0.8356, + "step": 10080 + }, + { + "epoch": 0.41498890623716, + "grad_norm": 1.2117750644683838, + "learning_rate": 0.0001839722953797073, + "loss": 0.8483, + "step": 10100 + }, + { + "epoch": 0.4158106664475306, + "grad_norm": 1.1228010654449463, + "learning_rate": 0.00018389995875137978, + "loss": 0.818, + "step": 10120 + }, + { + "epoch": 0.41663242665790123, + "grad_norm": 1.03129243850708, + "learning_rate": 0.00018382747354268351, + "loss": 0.8556, + "step": 10140 + }, + { + "epoch": 0.41745418686827185, + "grad_norm": 1.239634394645691, + "learning_rate": 0.00018375847508876958, + "loss": 0.8581, + "step": 10160 + }, + { + "epoch": 0.41827594707864246, + "grad_norm": 1.046134114265442, + "learning_rate": 0.0001836857005178056, + "loss": 0.8241, + "step": 10180 + }, + { + "epoch": 0.4190977072890131, + "grad_norm": 1.3205509185791016, + "learning_rate": 0.00018361277774590765, + "loss": 0.8409, + "step": 10200 + }, + { + "epoch": 0.4199194674993837, + "grad_norm": 1.1549128293991089, + "learning_rate": 0.00018353970690221646, + "loss": 0.8252, + "step": 10220 + }, + { + "epoch": 0.4207412277097543, + "grad_norm": 1.1062310934066772, + "learning_rate": 0.00018346648811613477, + "loss": 0.8428, + "step": 10240 + }, + { + "epoch": 0.4215629879201249, + "grad_norm": 1.2223172187805176, + "learning_rate": 0.0001833931215173274, + "loss": 0.8118, + "step": 10260 + }, + { + "epoch": 0.42238474813049554, + "grad_norm": 1.2306350469589233, + "learning_rate": 0.00018331960723572105, + "loss": 0.8615, + "step": 10280 + }, + { + "epoch": 0.42320650834086615, + "grad_norm": 1.301054835319519, + "learning_rate": 0.0001832459454015038, + "loss": 0.8388, + "step": 10300 + }, + { + "epoch": 0.42402826855123676, + "grad_norm": 1.1266446113586426, + "learning_rate": 0.00018317213614512507, + "loss": 0.844, + "step": 10320 + }, + { + "epoch": 0.4248500287616074, + "grad_norm": 1.1231412887573242, + "learning_rate": 0.0001830981795972954, + "loss": 0.8581, + "step": 10340 + }, + { + "epoch": 0.425671788971978, + "grad_norm": 1.0722932815551758, + "learning_rate": 0.00018302407588898612, + "loss": 0.844, + "step": 10360 + }, + { + "epoch": 0.4264935491823486, + "grad_norm": 1.122125506401062, + "learning_rate": 0.0001829498251514292, + "loss": 0.817, + "step": 10380 + }, + { + "epoch": 0.4273153093927192, + "grad_norm": 1.1031594276428223, + "learning_rate": 0.00018287542751611703, + "loss": 0.8448, + "step": 10400 + }, + { + "epoch": 0.42813706960308984, + "grad_norm": 1.166870355606079, + "learning_rate": 0.00018280088311480201, + "loss": 0.8607, + "step": 10420 + }, + { + "epoch": 0.42895882981346045, + "grad_norm": 1.1672008037567139, + "learning_rate": 0.0001827261920794966, + "loss": 0.8206, + "step": 10440 + }, + { + "epoch": 0.42978059002383107, + "grad_norm": 1.7684413194656372, + "learning_rate": 0.00018265135454247284, + "loss": 0.8503, + "step": 10460 + }, + { + "epoch": 0.4306023502342017, + "grad_norm": 1.3215503692626953, + "learning_rate": 0.00018257637063626226, + "loss": 0.8535, + "step": 10480 + }, + { + "epoch": 0.4314241104445723, + "grad_norm": 1.1748920679092407, + "learning_rate": 0.0001825012404936556, + "loss": 0.8364, + "step": 10500 + }, + { + "epoch": 0.4322458706549429, + "grad_norm": 1.0914533138275146, + "learning_rate": 0.00018242596424770252, + "loss": 0.8144, + "step": 10520 + }, + { + "epoch": 0.4330676308653135, + "grad_norm": 1.2216529846191406, + "learning_rate": 0.0001823505420317115, + "loss": 0.8566, + "step": 10540 + }, + { + "epoch": 0.43388939107568414, + "grad_norm": 1.2634260654449463, + "learning_rate": 0.00018227497397924948, + "loss": 0.8372, + "step": 10560 + }, + { + "epoch": 0.43471115128605475, + "grad_norm": 1.244780421257019, + "learning_rate": 0.00018219926022414163, + "loss": 0.8397, + "step": 10580 + }, + { + "epoch": 0.43553291149642537, + "grad_norm": 1.177216649055481, + "learning_rate": 0.00018212340090047118, + "loss": 0.8122, + "step": 10600 + }, + { + "epoch": 0.436354671706796, + "grad_norm": 1.1915186643600464, + "learning_rate": 0.0001820473961425792, + "loss": 0.829, + "step": 10620 + }, + { + "epoch": 0.4371764319171666, + "grad_norm": 0.9915897250175476, + "learning_rate": 0.00018197124608506423, + "loss": 0.8401, + "step": 10640 + }, + { + "epoch": 0.4379981921275372, + "grad_norm": 1.1870684623718262, + "learning_rate": 0.00018189495086278214, + "loss": 0.8481, + "step": 10660 + }, + { + "epoch": 0.4388199523379078, + "grad_norm": 1.1735584735870361, + "learning_rate": 0.00018181851061084596, + "loss": 0.8539, + "step": 10680 + }, + { + "epoch": 0.4396417125482784, + "grad_norm": 1.2218267917633057, + "learning_rate": 0.00018174192546462542, + "loss": 0.8416, + "step": 10700 + }, + { + "epoch": 0.440463472758649, + "grad_norm": 1.2126885652542114, + "learning_rate": 0.000181665195559747, + "loss": 0.8353, + "step": 10720 + }, + { + "epoch": 0.4412852329690196, + "grad_norm": 1.1790574789047241, + "learning_rate": 0.00018158832103209335, + "loss": 0.8235, + "step": 10740 + }, + { + "epoch": 0.44210699317939023, + "grad_norm": 1.153855323791504, + "learning_rate": 0.00018151130201780343, + "loss": 0.8415, + "step": 10760 + }, + { + "epoch": 0.44292875338976084, + "grad_norm": 1.0696359872817993, + "learning_rate": 0.00018143413865327198, + "loss": 0.8514, + "step": 10780 + }, + { + "epoch": 0.44375051360013146, + "grad_norm": 1.2022265195846558, + "learning_rate": 0.00018135683107514932, + "loss": 0.8102, + "step": 10800 + }, + { + "epoch": 0.44457227381050207, + "grad_norm": 1.1839139461517334, + "learning_rate": 0.00018127937942034127, + "loss": 0.8243, + "step": 10820 + }, + { + "epoch": 0.4453940340208727, + "grad_norm": 1.1672074794769287, + "learning_rate": 0.00018120178382600874, + "loss": 0.8707, + "step": 10840 + }, + { + "epoch": 0.4462157942312433, + "grad_norm": 1.1144109964370728, + "learning_rate": 0.00018112404442956754, + "loss": 0.833, + "step": 10860 + }, + { + "epoch": 0.4470375544416139, + "grad_norm": 1.2560542821884155, + "learning_rate": 0.00018104616136868816, + "loss": 0.8164, + "step": 10880 + }, + { + "epoch": 0.44785931465198453, + "grad_norm": 1.1620038747787476, + "learning_rate": 0.00018096813478129552, + "loss": 0.857, + "step": 10900 + }, + { + "epoch": 0.44868107486235514, + "grad_norm": 1.0956732034683228, + "learning_rate": 0.0001808899648055687, + "loss": 0.8069, + "step": 10920 + }, + { + "epoch": 0.44950283507272576, + "grad_norm": 1.153667688369751, + "learning_rate": 0.00018081165157994068, + "loss": 0.8228, + "step": 10940 + }, + { + "epoch": 0.4503245952830964, + "grad_norm": 1.1663360595703125, + "learning_rate": 0.00018073319524309822, + "loss": 0.8438, + "step": 10960 + }, + { + "epoch": 0.451146355493467, + "grad_norm": 1.0581029653549194, + "learning_rate": 0.00018065459593398137, + "loss": 0.8245, + "step": 10980 + }, + { + "epoch": 0.4519681157038376, + "grad_norm": 1.3234822750091553, + "learning_rate": 0.0001805758537917835, + "loss": 0.8462, + "step": 11000 + }, + { + "epoch": 0.4527898759142082, + "grad_norm": 1.1863617897033691, + "learning_rate": 0.0001804969689559509, + "loss": 0.815, + "step": 11020 + }, + { + "epoch": 0.45361163612457883, + "grad_norm": 1.2528035640716553, + "learning_rate": 0.00018041794156618252, + "loss": 0.8239, + "step": 11040 + }, + { + "epoch": 0.45443339633494945, + "grad_norm": 2.205885887145996, + "learning_rate": 0.00018033877176242975, + "loss": 0.8061, + "step": 11060 + }, + { + "epoch": 0.45525515654532006, + "grad_norm": 1.2284212112426758, + "learning_rate": 0.00018025945968489626, + "loss": 0.837, + "step": 11080 + }, + { + "epoch": 0.4560769167556907, + "grad_norm": 1.1616712808609009, + "learning_rate": 0.00018018000547403765, + "loss": 0.8502, + "step": 11100 + }, + { + "epoch": 0.4568986769660613, + "grad_norm": 1.188586711883545, + "learning_rate": 0.00018010040927056117, + "loss": 0.842, + "step": 11120 + }, + { + "epoch": 0.4577204371764319, + "grad_norm": 1.1483261585235596, + "learning_rate": 0.00018002067121542558, + "loss": 0.8503, + "step": 11140 + }, + { + "epoch": 0.4585421973868025, + "grad_norm": 1.1956408023834229, + "learning_rate": 0.00017994079144984087, + "loss": 0.8452, + "step": 11160 + }, + { + "epoch": 0.45936395759717313, + "grad_norm": 1.281135082244873, + "learning_rate": 0.00017986077011526792, + "loss": 0.8641, + "step": 11180 + }, + { + "epoch": 0.46018571780754375, + "grad_norm": 1.3326410055160522, + "learning_rate": 0.00017978060735341836, + "loss": 0.81, + "step": 11200 + }, + { + "epoch": 0.46100747801791436, + "grad_norm": 1.331778645515442, + "learning_rate": 0.0001797003033062543, + "loss": 0.8401, + "step": 11220 + }, + { + "epoch": 0.461829238228285, + "grad_norm": 1.1928249597549438, + "learning_rate": 0.000179619858115988, + "loss": 0.8165, + "step": 11240 + }, + { + "epoch": 0.4626509984386556, + "grad_norm": 1.3430322408676147, + "learning_rate": 0.0001795392719250817, + "loss": 0.8581, + "step": 11260 + }, + { + "epoch": 0.4634727586490262, + "grad_norm": 1.142386555671692, + "learning_rate": 0.00017945854487624733, + "loss": 0.8397, + "step": 11280 + }, + { + "epoch": 0.4642945188593968, + "grad_norm": 1.1435790061950684, + "learning_rate": 0.0001793776771124463, + "loss": 0.8781, + "step": 11300 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 1.1941109895706177, + "learning_rate": 0.00017929666877688919, + "loss": 0.8508, + "step": 11320 + }, + { + "epoch": 0.46593803928013805, + "grad_norm": 1.1792101860046387, + "learning_rate": 0.00017921552001303552, + "loss": 0.8559, + "step": 11340 + }, + { + "epoch": 0.46675979949050866, + "grad_norm": 1.2072590589523315, + "learning_rate": 0.00017913423096459354, + "loss": 0.8416, + "step": 11360 + }, + { + "epoch": 0.4675815597008793, + "grad_norm": 1.220444679260254, + "learning_rate": 0.00017905280177551983, + "loss": 0.8261, + "step": 11380 + }, + { + "epoch": 0.4684033199112499, + "grad_norm": 1.1701765060424805, + "learning_rate": 0.00017897123259001926, + "loss": 0.8509, + "step": 11400 + }, + { + "epoch": 0.4692250801216205, + "grad_norm": 1.2121479511260986, + "learning_rate": 0.00017888952355254455, + "loss": 0.8424, + "step": 11420 + }, + { + "epoch": 0.4700468403319911, + "grad_norm": 1.1610180139541626, + "learning_rate": 0.0001788076748077962, + "loss": 0.8433, + "step": 11440 + }, + { + "epoch": 0.47086860054236174, + "grad_norm": 1.0766757726669312, + "learning_rate": 0.0001787256865007219, + "loss": 0.8175, + "step": 11460 + }, + { + "epoch": 0.47169036075273235, + "grad_norm": 1.2801762819290161, + "learning_rate": 0.00017864355877651676, + "loss": 0.8416, + "step": 11480 + }, + { + "epoch": 0.47251212096310297, + "grad_norm": 1.167050838470459, + "learning_rate": 0.00017856129178062257, + "loss": 0.842, + "step": 11500 + }, + { + "epoch": 0.4733338811734736, + "grad_norm": 1.0567585229873657, + "learning_rate": 0.0001784788856587279, + "loss": 0.8467, + "step": 11520 + }, + { + "epoch": 0.4741556413838442, + "grad_norm": 1.101453185081482, + "learning_rate": 0.00017839634055676762, + "loss": 0.836, + "step": 11540 + }, + { + "epoch": 0.4749774015942148, + "grad_norm": 1.2371095418930054, + "learning_rate": 0.00017831365662092274, + "loss": 0.8623, + "step": 11560 + }, + { + "epoch": 0.4757991618045854, + "grad_norm": 1.2750577926635742, + "learning_rate": 0.00017823083399762018, + "loss": 0.8413, + "step": 11580 + }, + { + "epoch": 0.47662092201495604, + "grad_norm": 1.2494827508926392, + "learning_rate": 0.00017814787283353245, + "loss": 0.8393, + "step": 11600 + }, + { + "epoch": 0.47744268222532665, + "grad_norm": 1.1090929508209229, + "learning_rate": 0.0001780647732755773, + "loss": 0.8224, + "step": 11620 + }, + { + "epoch": 0.47826444243569727, + "grad_norm": 1.2686065435409546, + "learning_rate": 0.00017798153547091773, + "loss": 0.8468, + "step": 11640 + }, + { + "epoch": 0.4790862026460679, + "grad_norm": 1.0384360551834106, + "learning_rate": 0.0001778981595669615, + "loss": 0.8346, + "step": 11660 + }, + { + "epoch": 0.4799079628564385, + "grad_norm": 1.1490122079849243, + "learning_rate": 0.0001778146457113608, + "loss": 0.8565, + "step": 11680 + }, + { + "epoch": 0.4807297230668091, + "grad_norm": 1.1408191919326782, + "learning_rate": 0.00017773099405201236, + "loss": 0.8337, + "step": 11700 + }, + { + "epoch": 0.4815514832771797, + "grad_norm": 1.1228752136230469, + "learning_rate": 0.00017764720473705675, + "loss": 0.8236, + "step": 11720 + }, + { + "epoch": 0.48237324348755034, + "grad_norm": 1.0747302770614624, + "learning_rate": 0.00017756327791487847, + "loss": 0.8439, + "step": 11740 + }, + { + "epoch": 0.48319500369792096, + "grad_norm": 1.1848806142807007, + "learning_rate": 0.0001774792137341054, + "loss": 0.8433, + "step": 11760 + }, + { + "epoch": 0.48401676390829157, + "grad_norm": 1.1458165645599365, + "learning_rate": 0.00017739501234360875, + "loss": 0.8513, + "step": 11780 + }, + { + "epoch": 0.4848385241186622, + "grad_norm": 1.2221580743789673, + "learning_rate": 0.00017731067389250272, + "loss": 0.8345, + "step": 11800 + }, + { + "epoch": 0.4856602843290328, + "grad_norm": 1.2535064220428467, + "learning_rate": 0.00017722619853014423, + "loss": 0.842, + "step": 11820 + }, + { + "epoch": 0.4864820445394034, + "grad_norm": 1.2956807613372803, + "learning_rate": 0.0001771415864061326, + "loss": 0.833, + "step": 11840 + }, + { + "epoch": 0.487303804749774, + "grad_norm": 1.3538552522659302, + "learning_rate": 0.0001770568376703094, + "loss": 0.8389, + "step": 11860 + }, + { + "epoch": 0.48812556496014464, + "grad_norm": 1.325257658958435, + "learning_rate": 0.00017697195247275813, + "loss": 0.8041, + "step": 11880 + }, + { + "epoch": 0.48894732517051526, + "grad_norm": 1.1765714883804321, + "learning_rate": 0.00017688693096380392, + "loss": 0.8406, + "step": 11900 + }, + { + "epoch": 0.48976908538088587, + "grad_norm": 1.1218301057815552, + "learning_rate": 0.00017680177329401333, + "loss": 0.8562, + "step": 11920 + }, + { + "epoch": 0.4905908455912565, + "grad_norm": 1.1272341012954712, + "learning_rate": 0.00017671647961419406, + "loss": 0.8638, + "step": 11940 + }, + { + "epoch": 0.4914126058016271, + "grad_norm": 1.0992316007614136, + "learning_rate": 0.00017663105007539463, + "loss": 0.8156, + "step": 11960 + }, + { + "epoch": 0.4922343660119977, + "grad_norm": 1.1406649351119995, + "learning_rate": 0.00017654548482890414, + "loss": 0.8444, + "step": 11980 + }, + { + "epoch": 0.49305612622236833, + "grad_norm": 1.251295566558838, + "learning_rate": 0.00017645978402625214, + "loss": 0.8549, + "step": 12000 + }, + { + "epoch": 0.49305612622236833, + "eval_loss": 1.1104093790054321, + "eval_runtime": 16.7503, + "eval_samples_per_second": 156.415, + "eval_steps_per_second": 4.895, + "step": 12000 + }, + { + "epoch": 0.49387788643273894, + "grad_norm": 1.153436303138733, + "learning_rate": 0.00017637394781920812, + "loss": 0.8262, + "step": 12020 + }, + { + "epoch": 0.49469964664310956, + "grad_norm": 1.200875997543335, + "learning_rate": 0.00017628797635978134, + "loss": 0.8644, + "step": 12040 + }, + { + "epoch": 0.4955214068534802, + "grad_norm": 1.2446619272232056, + "learning_rate": 0.00017620186980022072, + "loss": 0.8485, + "step": 12060 + }, + { + "epoch": 0.4963431670638508, + "grad_norm": 1.1602336168289185, + "learning_rate": 0.00017611562829301429, + "loss": 0.8318, + "step": 12080 + }, + { + "epoch": 0.4971649272742214, + "grad_norm": 1.4564729928970337, + "learning_rate": 0.00017602925199088917, + "loss": 0.8375, + "step": 12100 + }, + { + "epoch": 0.497986687484592, + "grad_norm": 1.107946753501892, + "learning_rate": 0.00017594274104681108, + "loss": 0.8292, + "step": 12120 + }, + { + "epoch": 0.49880844769496263, + "grad_norm": 1.250048279762268, + "learning_rate": 0.00017585609561398426, + "loss": 0.817, + "step": 12140 + }, + { + "epoch": 0.49963020790533325, + "grad_norm": 0.956064760684967, + "learning_rate": 0.00017576931584585117, + "loss": 0.8311, + "step": 12160 + }, + { + "epoch": 0.5004519681157038, + "grad_norm": 1.1825581789016724, + "learning_rate": 0.000175682401896092, + "loss": 0.8232, + "step": 12180 + }, + { + "epoch": 0.5012737283260744, + "grad_norm": 1.26679265499115, + "learning_rate": 0.00017559535391862476, + "loss": 0.7985, + "step": 12200 + }, + { + "epoch": 0.502095488536445, + "grad_norm": 1.1247515678405762, + "learning_rate": 0.00017550817206760463, + "loss": 0.8177, + "step": 12220 + }, + { + "epoch": 0.5029172487468156, + "grad_norm": 1.2112337350845337, + "learning_rate": 0.00017542085649742403, + "loss": 0.851, + "step": 12240 + }, + { + "epoch": 0.5037390089571863, + "grad_norm": 1.3968725204467773, + "learning_rate": 0.00017533340736271207, + "loss": 0.8422, + "step": 12260 + }, + { + "epoch": 0.5045607691675569, + "grad_norm": 1.2992043495178223, + "learning_rate": 0.00017524582481833444, + "loss": 0.8492, + "step": 12280 + }, + { + "epoch": 0.5053825293779275, + "grad_norm": 1.1721656322479248, + "learning_rate": 0.0001751581090193931, + "loss": 0.8321, + "step": 12300 + }, + { + "epoch": 0.5062042895882981, + "grad_norm": 1.3727058172225952, + "learning_rate": 0.00017507026012122595, + "loss": 0.8666, + "step": 12320 + }, + { + "epoch": 0.5070260497986687, + "grad_norm": 1.27950119972229, + "learning_rate": 0.0001749822782794067, + "loss": 0.8643, + "step": 12340 + }, + { + "epoch": 0.5078478100090393, + "grad_norm": 0.9998101592063904, + "learning_rate": 0.00017489416364974432, + "loss": 0.8319, + "step": 12360 + }, + { + "epoch": 0.50866957021941, + "grad_norm": 1.210250973701477, + "learning_rate": 0.0001748059163882831, + "loss": 0.8183, + "step": 12380 + }, + { + "epoch": 0.5094913304297806, + "grad_norm": 1.2826182842254639, + "learning_rate": 0.00017471753665130213, + "loss": 0.8421, + "step": 12400 + }, + { + "epoch": 0.5103130906401512, + "grad_norm": 1.2036994695663452, + "learning_rate": 0.00017462902459531508, + "loss": 0.8363, + "step": 12420 + }, + { + "epoch": 0.5111348508505218, + "grad_norm": 1.0195825099945068, + "learning_rate": 0.00017454038037707008, + "loss": 0.8338, + "step": 12440 + }, + { + "epoch": 0.5119566110608924, + "grad_norm": 1.2347939014434814, + "learning_rate": 0.00017445160415354916, + "loss": 0.839, + "step": 12460 + }, + { + "epoch": 0.512778371271263, + "grad_norm": 1.196473240852356, + "learning_rate": 0.00017436269608196817, + "loss": 0.85, + "step": 12480 + }, + { + "epoch": 0.5136001314816336, + "grad_norm": 1.4037846326828003, + "learning_rate": 0.00017427365631977648, + "loss": 0.84, + "step": 12500 + }, + { + "epoch": 0.5144218916920043, + "grad_norm": 1.2222518920898438, + "learning_rate": 0.00017418448502465667, + "loss": 0.855, + "step": 12520 + }, + { + "epoch": 0.5152436519023749, + "grad_norm": 1.2747788429260254, + "learning_rate": 0.0001740951823545242, + "loss": 0.8545, + "step": 12540 + }, + { + "epoch": 0.5160654121127455, + "grad_norm": 1.2375946044921875, + "learning_rate": 0.00017400574846752724, + "loss": 0.8351, + "step": 12560 + }, + { + "epoch": 0.5168871723231161, + "grad_norm": 1.084808588027954, + "learning_rate": 0.00017391618352204633, + "loss": 0.8344, + "step": 12580 + }, + { + "epoch": 0.5177089325334867, + "grad_norm": 1.228043794631958, + "learning_rate": 0.00017382648767669408, + "loss": 0.8321, + "step": 12600 + }, + { + "epoch": 0.5185306927438573, + "grad_norm": 1.125532865524292, + "learning_rate": 0.00017373666109031497, + "loss": 0.8383, + "step": 12620 + }, + { + "epoch": 0.5193524529542279, + "grad_norm": 1.1619880199432373, + "learning_rate": 0.00017364670392198492, + "loss": 0.8318, + "step": 12640 + }, + { + "epoch": 0.5201742131645986, + "grad_norm": 1.1036595106124878, + "learning_rate": 0.00017355661633101116, + "loss": 0.8265, + "step": 12660 + }, + { + "epoch": 0.5209959733749692, + "grad_norm": 1.084410548210144, + "learning_rate": 0.0001734663984769319, + "loss": 0.8293, + "step": 12680 + }, + { + "epoch": 0.5218177335853398, + "grad_norm": 1.3622374534606934, + "learning_rate": 0.000173376050519516, + "loss": 0.8496, + "step": 12700 + }, + { + "epoch": 0.5226394937957104, + "grad_norm": 1.2676513195037842, + "learning_rate": 0.00017328557261876273, + "loss": 0.8357, + "step": 12720 + }, + { + "epoch": 0.523461254006081, + "grad_norm": 1.1944401264190674, + "learning_rate": 0.00017319496493490148, + "loss": 0.8162, + "step": 12740 + }, + { + "epoch": 0.5242830142164516, + "grad_norm": 1.2380664348602295, + "learning_rate": 0.00017310422762839155, + "loss": 0.8319, + "step": 12760 + }, + { + "epoch": 0.5251047744268222, + "grad_norm": 1.3865251541137695, + "learning_rate": 0.00017301336085992163, + "loss": 0.8181, + "step": 12780 + }, + { + "epoch": 0.5259265346371929, + "grad_norm": 1.1436952352523804, + "learning_rate": 0.00017292236479040984, + "loss": 0.827, + "step": 12800 + }, + { + "epoch": 0.5267482948475635, + "grad_norm": 1.4181216955184937, + "learning_rate": 0.0001728312395810032, + "loss": 0.8297, + "step": 12820 + }, + { + "epoch": 0.5275700550579341, + "grad_norm": 1.4452232122421265, + "learning_rate": 0.00017273998539307742, + "loss": 0.8397, + "step": 12840 + }, + { + "epoch": 0.5283918152683047, + "grad_norm": 1.1824501752853394, + "learning_rate": 0.00017264860238823667, + "loss": 0.8088, + "step": 12860 + }, + { + "epoch": 0.5292135754786753, + "grad_norm": 1.1516762971878052, + "learning_rate": 0.0001725570907283132, + "loss": 0.8235, + "step": 12880 + }, + { + "epoch": 0.5300353356890459, + "grad_norm": 1.0846797227859497, + "learning_rate": 0.00017246545057536712, + "loss": 0.8149, + "step": 12900 + }, + { + "epoch": 0.5308570958994165, + "grad_norm": 1.244383692741394, + "learning_rate": 0.00017237368209168608, + "loss": 0.8488, + "step": 12920 + }, + { + "epoch": 0.5316788561097872, + "grad_norm": 1.0443450212478638, + "learning_rate": 0.000172281785439785, + "loss": 0.8089, + "step": 12940 + }, + { + "epoch": 0.5325006163201578, + "grad_norm": 1.282185673713684, + "learning_rate": 0.00017218976078240582, + "loss": 0.8164, + "step": 12960 + }, + { + "epoch": 0.5333223765305284, + "grad_norm": 1.264277696609497, + "learning_rate": 0.0001720976082825171, + "loss": 0.8512, + "step": 12980 + }, + { + "epoch": 0.534144136740899, + "grad_norm": 1.1357461214065552, + "learning_rate": 0.00017200532810331378, + "loss": 0.8368, + "step": 13000 + }, + { + "epoch": 0.5349658969512696, + "grad_norm": 1.066361427307129, + "learning_rate": 0.00017191292040821696, + "loss": 0.8273, + "step": 13020 + }, + { + "epoch": 0.5357876571616402, + "grad_norm": 1.2606313228607178, + "learning_rate": 0.00017182038536087363, + "loss": 0.8513, + "step": 13040 + }, + { + "epoch": 0.5366094173720108, + "grad_norm": 1.1224803924560547, + "learning_rate": 0.00017172772312515618, + "loss": 0.8417, + "step": 13060 + }, + { + "epoch": 0.5374311775823815, + "grad_norm": 1.2137328386306763, + "learning_rate": 0.00017163957634245275, + "loss": 0.8264, + "step": 13080 + }, + { + "epoch": 0.5382529377927521, + "grad_norm": 1.1143057346343994, + "learning_rate": 0.00017154666656159665, + "loss": 0.8317, + "step": 13100 + }, + { + "epoch": 0.5390746980031227, + "grad_norm": 1.087512493133545, + "learning_rate": 0.00017145363007710135, + "loss": 0.8194, + "step": 13120 + }, + { + "epoch": 0.5398964582134933, + "grad_norm": 1.2861449718475342, + "learning_rate": 0.0001713604670537273, + "loss": 0.843, + "step": 13140 + }, + { + "epoch": 0.5407182184238639, + "grad_norm": 1.2231301069259644, + "learning_rate": 0.00017126717765645908, + "loss": 0.8192, + "step": 13160 + }, + { + "epoch": 0.5415399786342345, + "grad_norm": 1.2475714683532715, + "learning_rate": 0.00017117376205050502, + "loss": 0.8524, + "step": 13180 + }, + { + "epoch": 0.5423617388446051, + "grad_norm": 1.1694715023040771, + "learning_rate": 0.00017108022040129695, + "loss": 0.8381, + "step": 13200 + }, + { + "epoch": 0.5431834990549758, + "grad_norm": 1.29911470413208, + "learning_rate": 0.00017098655287448993, + "loss": 0.8056, + "step": 13220 + }, + { + "epoch": 0.5440052592653464, + "grad_norm": 1.063346028327942, + "learning_rate": 0.00017089275963596195, + "loss": 0.8328, + "step": 13240 + }, + { + "epoch": 0.544827019475717, + "grad_norm": 1.6195141077041626, + "learning_rate": 0.0001707988408518136, + "loss": 0.8597, + "step": 13260 + }, + { + "epoch": 0.5456487796860876, + "grad_norm": 1.2005921602249146, + "learning_rate": 0.00017070479668836785, + "loss": 0.8415, + "step": 13280 + }, + { + "epoch": 0.5464705398964582, + "grad_norm": 1.1937131881713867, + "learning_rate": 0.00017061533875220887, + "loss": 0.8171, + "step": 13300 + }, + { + "epoch": 0.5472923001068288, + "grad_norm": 1.1822235584259033, + "learning_rate": 0.0001705210505783601, + "loss": 0.8596, + "step": 13320 + }, + { + "epoch": 0.5481140603171994, + "grad_norm": 1.0105253458023071, + "learning_rate": 0.00017042663751715912, + "loss": 0.8266, + "step": 13340 + }, + { + "epoch": 0.5489358205275701, + "grad_norm": 1.20473051071167, + "learning_rate": 0.00017033209973580418, + "loss": 0.8437, + "step": 13360 + }, + { + "epoch": 0.5497575807379407, + "grad_norm": 1.237752914428711, + "learning_rate": 0.00017023743740171438, + "loss": 0.8278, + "step": 13380 + }, + { + "epoch": 0.5505793409483113, + "grad_norm": 1.2165151834487915, + "learning_rate": 0.00017014265068252948, + "loss": 0.8494, + "step": 13400 + }, + { + "epoch": 0.5514011011586819, + "grad_norm": 1.2971493005752563, + "learning_rate": 0.00017004773974610941, + "loss": 0.8312, + "step": 13420 + }, + { + "epoch": 0.5522228613690525, + "grad_norm": 1.091404676437378, + "learning_rate": 0.0001699527047605342, + "loss": 0.8247, + "step": 13440 + }, + { + "epoch": 0.5530446215794231, + "grad_norm": 1.1684538125991821, + "learning_rate": 0.00016985754589410342, + "loss": 0.8149, + "step": 13460 + }, + { + "epoch": 0.5538663817897937, + "grad_norm": 1.201493740081787, + "learning_rate": 0.00016976226331533617, + "loss": 0.8288, + "step": 13480 + }, + { + "epoch": 0.5546881420001644, + "grad_norm": 1.2418882846832275, + "learning_rate": 0.0001696668571929705, + "loss": 0.8286, + "step": 13500 + }, + { + "epoch": 0.555509902210535, + "grad_norm": 1.073002576828003, + "learning_rate": 0.00016957132769596336, + "loss": 0.8178, + "step": 13520 + }, + { + "epoch": 0.5563316624209056, + "grad_norm": 1.156518816947937, + "learning_rate": 0.0001694756749934901, + "loss": 0.8322, + "step": 13540 + }, + { + "epoch": 0.5571534226312762, + "grad_norm": 1.2362408638000488, + "learning_rate": 0.00016937989925494432, + "loss": 0.8338, + "step": 13560 + }, + { + "epoch": 0.5579751828416468, + "grad_norm": 1.1928801536560059, + "learning_rate": 0.00016928400064993745, + "loss": 0.8243, + "step": 13580 + }, + { + "epoch": 0.5587969430520174, + "grad_norm": 1.1574454307556152, + "learning_rate": 0.0001691879793482986, + "loss": 0.8124, + "step": 13600 + }, + { + "epoch": 0.559618703262388, + "grad_norm": 1.0951565504074097, + "learning_rate": 0.00016909183552007398, + "loss": 0.8575, + "step": 13620 + }, + { + "epoch": 0.5604404634727587, + "grad_norm": 1.2538108825683594, + "learning_rate": 0.00016899556933552704, + "loss": 0.8149, + "step": 13640 + }, + { + "epoch": 0.5612622236831293, + "grad_norm": 1.2423375844955444, + "learning_rate": 0.0001688991809651377, + "loss": 0.8114, + "step": 13660 + }, + { + "epoch": 0.5620839838934999, + "grad_norm": 1.0723458528518677, + "learning_rate": 0.00016880267057960239, + "loss": 0.8384, + "step": 13680 + }, + { + "epoch": 0.5629057441038705, + "grad_norm": 1.0973854064941406, + "learning_rate": 0.00016870603834983356, + "loss": 0.8373, + "step": 13700 + }, + { + "epoch": 0.5637275043142411, + "grad_norm": 1.1496849060058594, + "learning_rate": 0.00016860928444695943, + "loss": 0.831, + "step": 13720 + }, + { + "epoch": 0.5645492645246117, + "grad_norm": 1.175132393836975, + "learning_rate": 0.00016851240904232375, + "loss": 0.8486, + "step": 13740 + }, + { + "epoch": 0.5653710247349824, + "grad_norm": 1.018654227256775, + "learning_rate": 0.0001684154123074854, + "loss": 0.8332, + "step": 13760 + }, + { + "epoch": 0.566192784945353, + "grad_norm": 1.3159968852996826, + "learning_rate": 0.00016831829441421809, + "loss": 0.8336, + "step": 13780 + }, + { + "epoch": 0.5670145451557236, + "grad_norm": 1.1875556707382202, + "learning_rate": 0.0001682210555345102, + "loss": 0.8209, + "step": 13800 + }, + { + "epoch": 0.5678363053660942, + "grad_norm": 1.2860361337661743, + "learning_rate": 0.00016812369584056424, + "loss": 0.8453, + "step": 13820 + }, + { + "epoch": 0.5686580655764648, + "grad_norm": 1.2040901184082031, + "learning_rate": 0.00016802621550479675, + "loss": 0.8204, + "step": 13840 + }, + { + "epoch": 0.5694798257868354, + "grad_norm": 1.1987425088882446, + "learning_rate": 0.00016792861469983793, + "loss": 0.8347, + "step": 13860 + }, + { + "epoch": 0.570301585997206, + "grad_norm": 1.1014000177383423, + "learning_rate": 0.00016783089359853127, + "loss": 0.8142, + "step": 13880 + }, + { + "epoch": 0.5711233462075767, + "grad_norm": 1.1898833513259888, + "learning_rate": 0.00016773305237393328, + "loss": 0.8198, + "step": 13900 + }, + { + "epoch": 0.5719451064179473, + "grad_norm": 1.2249850034713745, + "learning_rate": 0.00016763509119931334, + "loss": 0.8039, + "step": 13920 + }, + { + "epoch": 0.5727668666283179, + "grad_norm": 1.2241109609603882, + "learning_rate": 0.00016753701024815304, + "loss": 0.8458, + "step": 13940 + }, + { + "epoch": 0.5735886268386885, + "grad_norm": 1.2025922536849976, + "learning_rate": 0.00016743880969414624, + "loss": 0.8103, + "step": 13960 + }, + { + "epoch": 0.5744103870490591, + "grad_norm": 1.1043455600738525, + "learning_rate": 0.0001673404897111986, + "loss": 0.834, + "step": 13980 + }, + { + "epoch": 0.5752321472594297, + "grad_norm": 1.2452826499938965, + "learning_rate": 0.00016724205047342715, + "loss": 0.8216, + "step": 14000 + }, + { + "epoch": 0.5752321472594297, + "eval_loss": 1.1319005489349365, + "eval_runtime": 16.4046, + "eval_samples_per_second": 159.712, + "eval_steps_per_second": 4.999, + "step": 14000 + }, + { + "epoch": 0.5760539074698003, + "grad_norm": 1.248207688331604, + "learning_rate": 0.00016714349215516032, + "loss": 0.824, + "step": 14020 + }, + { + "epoch": 0.576875667680171, + "grad_norm": 1.3563847541809082, + "learning_rate": 0.0001670448149309372, + "loss": 0.8587, + "step": 14040 + }, + { + "epoch": 0.5776974278905416, + "grad_norm": 1.2468847036361694, + "learning_rate": 0.00016694601897550762, + "loss": 0.8481, + "step": 14060 + }, + { + "epoch": 0.5785191881009122, + "grad_norm": 1.139793872833252, + "learning_rate": 0.0001668471044638316, + "loss": 0.8353, + "step": 14080 + }, + { + "epoch": 0.5793409483112828, + "grad_norm": 1.0366131067276, + "learning_rate": 0.0001667480715710791, + "loss": 0.8091, + "step": 14100 + }, + { + "epoch": 0.5801627085216534, + "grad_norm": 1.1087323427200317, + "learning_rate": 0.00016664892047262977, + "loss": 0.8198, + "step": 14120 + }, + { + "epoch": 0.580984468732024, + "grad_norm": 1.137624740600586, + "learning_rate": 0.0001665496513440726, + "loss": 0.832, + "step": 14140 + }, + { + "epoch": 0.5818062289423946, + "grad_norm": 1.1337158679962158, + "learning_rate": 0.00016645026436120551, + "loss": 0.8438, + "step": 14160 + }, + { + "epoch": 0.5826279891527653, + "grad_norm": 1.1277921199798584, + "learning_rate": 0.00016635075970003525, + "loss": 0.8158, + "step": 14180 + }, + { + "epoch": 0.5834497493631359, + "grad_norm": 1.1939393281936646, + "learning_rate": 0.00016625113753677693, + "loss": 0.835, + "step": 14200 + }, + { + "epoch": 0.5842715095735065, + "grad_norm": 1.343687891960144, + "learning_rate": 0.00016615139804785367, + "loss": 0.8231, + "step": 14220 + }, + { + "epoch": 0.5850932697838771, + "grad_norm": 1.1290326118469238, + "learning_rate": 0.00016605154140989647, + "loss": 0.8456, + "step": 14240 + }, + { + "epoch": 0.5859150299942477, + "grad_norm": 1.274527668952942, + "learning_rate": 0.00016595156779974376, + "loss": 0.85, + "step": 14260 + }, + { + "epoch": 0.5867367902046183, + "grad_norm": 1.2629293203353882, + "learning_rate": 0.00016585147739444104, + "loss": 0.8245, + "step": 14280 + }, + { + "epoch": 0.587558550414989, + "grad_norm": 1.1809213161468506, + "learning_rate": 0.0001657512703712408, + "loss": 0.8063, + "step": 14300 + }, + { + "epoch": 0.5883803106253596, + "grad_norm": 1.0857917070388794, + "learning_rate": 0.00016565094690760193, + "loss": 0.8158, + "step": 14320 + }, + { + "epoch": 0.5892020708357302, + "grad_norm": 1.1015921831130981, + "learning_rate": 0.00016555050718118953, + "loss": 0.8501, + "step": 14340 + }, + { + "epoch": 0.5900238310461008, + "grad_norm": 1.3138587474822998, + "learning_rate": 0.00016544995136987467, + "loss": 0.8416, + "step": 14360 + }, + { + "epoch": 0.5908455912564714, + "grad_norm": 1.476505994796753, + "learning_rate": 0.00016534927965173392, + "loss": 0.8526, + "step": 14380 + }, + { + "epoch": 0.591667351466842, + "grad_norm": 1.1754002571105957, + "learning_rate": 0.00016524849220504918, + "loss": 0.8136, + "step": 14400 + }, + { + "epoch": 0.5924891116772126, + "grad_norm": 1.2151134014129639, + "learning_rate": 0.00016514758920830724, + "loss": 0.821, + "step": 14420 + }, + { + "epoch": 0.5933108718875832, + "grad_norm": 1.2440420389175415, + "learning_rate": 0.0001650465708401995, + "loss": 0.8274, + "step": 14440 + }, + { + "epoch": 0.5941326320979539, + "grad_norm": 1.1762789487838745, + "learning_rate": 0.0001649454372796218, + "loss": 0.8379, + "step": 14460 + }, + { + "epoch": 0.5949543923083245, + "grad_norm": 1.1778429746627808, + "learning_rate": 0.0001648441887056738, + "loss": 0.7901, + "step": 14480 + }, + { + "epoch": 0.5957761525186951, + "grad_norm": 1.1886394023895264, + "learning_rate": 0.0001647478961925985, + "loss": 0.8431, + "step": 14500 + }, + { + "epoch": 0.5965979127290657, + "grad_norm": 1.256296992301941, + "learning_rate": 0.0001646515001956767, + "loss": 0.8181, + "step": 14520 + }, + { + "epoch": 0.5974196729394363, + "grad_norm": 1.2841947078704834, + "learning_rate": 0.00016454991909764397, + "loss": 0.8255, + "step": 14540 + }, + { + "epoch": 0.5982414331498069, + "grad_norm": 1.1896756887435913, + "learning_rate": 0.0001644482236866732, + "loss": 0.8304, + "step": 14560 + }, + { + "epoch": 0.5990631933601775, + "grad_norm": 1.4413669109344482, + "learning_rate": 0.00016434641414285922, + "loss": 0.8374, + "step": 14580 + }, + { + "epoch": 0.5998849535705482, + "grad_norm": 1.2350515127182007, + "learning_rate": 0.00016424449064649882, + "loss": 0.8342, + "step": 14600 + }, + { + "epoch": 0.6007067137809188, + "grad_norm": 1.3465436697006226, + "learning_rate": 0.0001641424533780907, + "loss": 0.8066, + "step": 14620 + }, + { + "epoch": 0.6015284739912894, + "grad_norm": 1.351413369178772, + "learning_rate": 0.00016404030251833502, + "loss": 0.7952, + "step": 14640 + }, + { + "epoch": 0.60235023420166, + "grad_norm": 1.1581010818481445, + "learning_rate": 0.00016393803824813304, + "loss": 0.8005, + "step": 14660 + }, + { + "epoch": 0.6031719944120306, + "grad_norm": 1.218668818473816, + "learning_rate": 0.00016383566074858695, + "loss": 0.8293, + "step": 14680 + }, + { + "epoch": 0.6039937546224012, + "grad_norm": 1.1792479753494263, + "learning_rate": 0.00016373317020099943, + "loss": 0.8247, + "step": 14700 + }, + { + "epoch": 0.6048155148327718, + "grad_norm": 1.2358107566833496, + "learning_rate": 0.00016363056678687335, + "loss": 0.8418, + "step": 14720 + }, + { + "epoch": 0.6056372750431425, + "grad_norm": 1.2904804944992065, + "learning_rate": 0.00016352785068791142, + "loss": 0.8317, + "step": 14740 + }, + { + "epoch": 0.6064590352535131, + "grad_norm": 1.1909780502319336, + "learning_rate": 0.000163425022086016, + "loss": 0.8221, + "step": 14760 + }, + { + "epoch": 0.6072807954638837, + "grad_norm": 1.199325442314148, + "learning_rate": 0.00016332208116328853, + "loss": 0.819, + "step": 14780 + }, + { + "epoch": 0.6081025556742543, + "grad_norm": 1.4012378454208374, + "learning_rate": 0.00016321902810202956, + "loss": 0.8104, + "step": 14800 + }, + { + "epoch": 0.6089243158846248, + "grad_norm": 1.1930865049362183, + "learning_rate": 0.00016311586308473812, + "loss": 0.8227, + "step": 14820 + }, + { + "epoch": 0.6097460760949954, + "grad_norm": 1.055925965309143, + "learning_rate": 0.00016301258629411144, + "loss": 0.8337, + "step": 14840 + }, + { + "epoch": 0.610567836305366, + "grad_norm": 1.1309912204742432, + "learning_rate": 0.00016290919791304487, + "loss": 0.8154, + "step": 14860 + }, + { + "epoch": 0.6113895965157367, + "grad_norm": 1.272495985031128, + "learning_rate": 0.0001628056981246312, + "loss": 0.8277, + "step": 14880 + }, + { + "epoch": 0.6122113567261073, + "grad_norm": 1.4250491857528687, + "learning_rate": 0.00016270208711216059, + "loss": 0.8606, + "step": 14900 + }, + { + "epoch": 0.6130331169364779, + "grad_norm": 1.079211950302124, + "learning_rate": 0.00016259836505912024, + "loss": 0.8485, + "step": 14920 + }, + { + "epoch": 0.6138548771468485, + "grad_norm": 1.2047349214553833, + "learning_rate": 0.00016249453214919383, + "loss": 0.8123, + "step": 14940 + }, + { + "epoch": 0.6146766373572191, + "grad_norm": 1.344088077545166, + "learning_rate": 0.0001623905885662615, + "loss": 0.8155, + "step": 14960 + }, + { + "epoch": 0.6154983975675897, + "grad_norm": 1.3220460414886475, + "learning_rate": 0.0001622865344943993, + "loss": 0.8544, + "step": 14980 + }, + { + "epoch": 0.6163201577779603, + "grad_norm": 1.2667044401168823, + "learning_rate": 0.00016218237011787905, + "loss": 0.8049, + "step": 15000 + }, + { + "epoch": 0.617141917988331, + "grad_norm": 1.2205132246017456, + "learning_rate": 0.0001620780956211678, + "loss": 0.8248, + "step": 15020 + }, + { + "epoch": 0.6179636781987016, + "grad_norm": 1.2170429229736328, + "learning_rate": 0.00016197371118892762, + "loss": 0.8176, + "step": 15040 + }, + { + "epoch": 0.6187854384090722, + "grad_norm": 1.1776739358901978, + "learning_rate": 0.00016186921700601535, + "loss": 0.8358, + "step": 15060 + }, + { + "epoch": 0.6196071986194428, + "grad_norm": 1.305285096168518, + "learning_rate": 0.00016176461325748215, + "loss": 0.8279, + "step": 15080 + }, + { + "epoch": 0.6204289588298134, + "grad_norm": 1.4242192506790161, + "learning_rate": 0.00016165990012857316, + "loss": 0.8331, + "step": 15100 + }, + { + "epoch": 0.621250719040184, + "grad_norm": 1.2353391647338867, + "learning_rate": 0.00016155507780472734, + "loss": 0.833, + "step": 15120 + }, + { + "epoch": 0.6220724792505546, + "grad_norm": 1.0569748878479004, + "learning_rate": 0.0001614501464715769, + "loss": 0.8491, + "step": 15140 + }, + { + "epoch": 0.6228942394609253, + "grad_norm": 1.0396721363067627, + "learning_rate": 0.00016134510631494718, + "loss": 0.7974, + "step": 15160 + }, + { + "epoch": 0.6237159996712959, + "grad_norm": 1.1455570459365845, + "learning_rate": 0.00016123995752085623, + "loss": 0.8384, + "step": 15180 + }, + { + "epoch": 0.6245377598816665, + "grad_norm": 1.1126292943954468, + "learning_rate": 0.00016113470027551442, + "loss": 0.8595, + "step": 15200 + }, + { + "epoch": 0.6253595200920371, + "grad_norm": 1.1759933233261108, + "learning_rate": 0.00016102933476532427, + "loss": 0.8161, + "step": 15220 + }, + { + "epoch": 0.6261812803024077, + "grad_norm": 1.1573827266693115, + "learning_rate": 0.00016092386117687996, + "loss": 0.8321, + "step": 15240 + }, + { + "epoch": 0.6270030405127783, + "grad_norm": 1.1900019645690918, + "learning_rate": 0.00016081827969696713, + "loss": 0.825, + "step": 15260 + }, + { + "epoch": 0.627824800723149, + "grad_norm": 1.1662086248397827, + "learning_rate": 0.0001607125905125624, + "loss": 0.8244, + "step": 15280 + }, + { + "epoch": 0.6286465609335196, + "grad_norm": 1.0952377319335938, + "learning_rate": 0.00016060679381083324, + "loss": 0.8183, + "step": 15300 + }, + { + "epoch": 0.6294683211438902, + "grad_norm": 1.1535894870758057, + "learning_rate": 0.00016050088977913744, + "loss": 0.8147, + "step": 15320 + }, + { + "epoch": 0.6302900813542608, + "grad_norm": 1.3305004835128784, + "learning_rate": 0.0001603948786050229, + "loss": 0.8172, + "step": 15340 + }, + { + "epoch": 0.6311118415646314, + "grad_norm": 1.3150241374969482, + "learning_rate": 0.0001602887604762272, + "loss": 0.8058, + "step": 15360 + }, + { + "epoch": 0.631933601775002, + "grad_norm": 1.2193336486816406, + "learning_rate": 0.00016018253558067744, + "loss": 0.8165, + "step": 15380 + }, + { + "epoch": 0.6327553619853726, + "grad_norm": 1.2560005187988281, + "learning_rate": 0.0001600762041064897, + "loss": 0.8206, + "step": 15400 + }, + { + "epoch": 0.6335771221957432, + "grad_norm": 1.3278276920318604, + "learning_rate": 0.00015996976624196884, + "loss": 0.8178, + "step": 15420 + }, + { + "epoch": 0.6343988824061139, + "grad_norm": 1.2090293169021606, + "learning_rate": 0.00015986322217560815, + "loss": 0.8443, + "step": 15440 + }, + { + "epoch": 0.6352206426164845, + "grad_norm": 1.2651256322860718, + "learning_rate": 0.00015975657209608895, + "loss": 0.7986, + "step": 15460 + }, + { + "epoch": 0.6360424028268551, + "grad_norm": 1.229529619216919, + "learning_rate": 0.00015964981619228028, + "loss": 0.8127, + "step": 15480 + }, + { + "epoch": 0.6368641630372257, + "grad_norm": 1.235855221748352, + "learning_rate": 0.00015954295465323866, + "loss": 0.8184, + "step": 15500 + }, + { + "epoch": 0.6376859232475963, + "grad_norm": 1.1475471258163452, + "learning_rate": 0.00015943598766820772, + "loss": 0.8258, + "step": 15520 + }, + { + "epoch": 0.6385076834579669, + "grad_norm": 1.4335112571716309, + "learning_rate": 0.00015932891542661768, + "loss": 0.8153, + "step": 15540 + }, + { + "epoch": 0.6393294436683375, + "grad_norm": 1.1948201656341553, + "learning_rate": 0.00015922173811808526, + "loss": 0.8501, + "step": 15560 + }, + { + "epoch": 0.6401512038787082, + "grad_norm": 1.1850942373275757, + "learning_rate": 0.00015911445593241318, + "loss": 0.8307, + "step": 15580 + }, + { + "epoch": 0.6409729640890788, + "grad_norm": 1.2479064464569092, + "learning_rate": 0.00015900706905959, + "loss": 0.812, + "step": 15600 + }, + { + "epoch": 0.6417947242994494, + "grad_norm": 1.0971261262893677, + "learning_rate": 0.00015889957768978956, + "loss": 0.8059, + "step": 15620 + }, + { + "epoch": 0.64261648450982, + "grad_norm": 1.409441351890564, + "learning_rate": 0.0001587919820133708, + "loss": 0.806, + "step": 15640 + }, + { + "epoch": 0.6434382447201906, + "grad_norm": 1.1296477317810059, + "learning_rate": 0.00015868428222087745, + "loss": 0.8279, + "step": 15660 + }, + { + "epoch": 0.6442600049305612, + "grad_norm": 1.2360674142837524, + "learning_rate": 0.00015857647850303744, + "loss": 0.8098, + "step": 15680 + }, + { + "epoch": 0.6450817651409319, + "grad_norm": 1.2544413805007935, + "learning_rate": 0.00015846857105076295, + "loss": 0.8236, + "step": 15700 + }, + { + "epoch": 0.6459035253513025, + "grad_norm": 0.9945731163024902, + "learning_rate": 0.0001583605600551497, + "loss": 0.8175, + "step": 15720 + }, + { + "epoch": 0.6467252855616731, + "grad_norm": 1.3537676334381104, + "learning_rate": 0.00015825244570747695, + "loss": 0.807, + "step": 15740 + }, + { + "epoch": 0.6475470457720437, + "grad_norm": 1.1799793243408203, + "learning_rate": 0.0001581442281992068, + "loss": 0.8446, + "step": 15760 + }, + { + "epoch": 0.6483688059824143, + "grad_norm": 1.1959508657455444, + "learning_rate": 0.00015803590772198417, + "loss": 0.8156, + "step": 15780 + }, + { + "epoch": 0.6491905661927849, + "grad_norm": 1.1003177165985107, + "learning_rate": 0.0001579274844676362, + "loss": 0.8295, + "step": 15800 + }, + { + "epoch": 0.6500123264031555, + "grad_norm": 1.1961150169372559, + "learning_rate": 0.00015781895862817226, + "loss": 0.8428, + "step": 15820 + }, + { + "epoch": 0.6508340866135262, + "grad_norm": 1.3239089250564575, + "learning_rate": 0.0001577103303957832, + "loss": 0.8322, + "step": 15840 + }, + { + "epoch": 0.6516558468238968, + "grad_norm": 1.190565586090088, + "learning_rate": 0.0001576015999628412, + "loss": 0.8332, + "step": 15860 + }, + { + "epoch": 0.6524776070342674, + "grad_norm": 1.158389925956726, + "learning_rate": 0.0001574927675218996, + "loss": 0.8065, + "step": 15880 + }, + { + "epoch": 0.653299367244638, + "grad_norm": 1.1195902824401855, + "learning_rate": 0.00015738383326569218, + "loss": 0.8145, + "step": 15900 + }, + { + "epoch": 0.6541211274550086, + "grad_norm": 1.1594524383544922, + "learning_rate": 0.00015727479738713315, + "loss": 0.8094, + "step": 15920 + }, + { + "epoch": 0.6549428876653792, + "grad_norm": 1.176584005355835, + "learning_rate": 0.0001571656600793167, + "loss": 0.8383, + "step": 15940 + }, + { + "epoch": 0.6557646478757498, + "grad_norm": 1.188091516494751, + "learning_rate": 0.00015705642153551654, + "loss": 0.8155, + "step": 15960 + }, + { + "epoch": 0.6565864080861205, + "grad_norm": 1.1001297235488892, + "learning_rate": 0.00015694708194918575, + "loss": 0.8627, + "step": 15980 + }, + { + "epoch": 0.6574081682964911, + "grad_norm": 1.2960819005966187, + "learning_rate": 0.00015683764151395635, + "loss": 0.8426, + "step": 16000 + }, + { + "epoch": 0.6574081682964911, + "eval_loss": 1.121155858039856, + "eval_runtime": 16.4628, + "eval_samples_per_second": 159.147, + "eval_steps_per_second": 4.981, + "step": 16000 + }, + { + "epoch": 0.6582299285068617, + "grad_norm": 1.2257232666015625, + "learning_rate": 0.00015672810042363885, + "loss": 0.8271, + "step": 16020 + }, + { + "epoch": 0.6590516887172323, + "grad_norm": 1.309381127357483, + "learning_rate": 0.0001566239433327474, + "loss": 0.8404, + "step": 16040 + }, + { + "epoch": 0.6598734489276029, + "grad_norm": 1.1223074197769165, + "learning_rate": 0.00015651420652313064, + "loss": 0.8329, + "step": 16060 + }, + { + "epoch": 0.6606952091379735, + "grad_norm": 1.2301430702209473, + "learning_rate": 0.00015640436963120435, + "loss": 0.8332, + "step": 16080 + }, + { + "epoch": 0.6615169693483441, + "grad_norm": 1.0917423963546753, + "learning_rate": 0.00015629443285148113, + "loss": 0.8068, + "step": 16100 + }, + { + "epoch": 0.6623387295587148, + "grad_norm": 1.2451525926589966, + "learning_rate": 0.0001561843963786506, + "loss": 0.8214, + "step": 16120 + }, + { + "epoch": 0.6631604897690854, + "grad_norm": 1.3091555833816528, + "learning_rate": 0.00015607426040757885, + "loss": 0.8122, + "step": 16140 + }, + { + "epoch": 0.663982249979456, + "grad_norm": 1.1900159120559692, + "learning_rate": 0.00015596402513330822, + "loss": 0.8296, + "step": 16160 + }, + { + "epoch": 0.6648040101898266, + "grad_norm": 1.2061822414398193, + "learning_rate": 0.00015585369075105693, + "loss": 0.8226, + "step": 16180 + }, + { + "epoch": 0.6656257704001972, + "grad_norm": 1.3246262073516846, + "learning_rate": 0.00015574325745621866, + "loss": 0.8141, + "step": 16200 + }, + { + "epoch": 0.6664475306105678, + "grad_norm": 1.1857435703277588, + "learning_rate": 0.00015563272544436222, + "loss": 0.8213, + "step": 16220 + }, + { + "epoch": 0.6672692908209384, + "grad_norm": 1.2091962099075317, + "learning_rate": 0.00015552209491123136, + "loss": 0.8306, + "step": 16240 + }, + { + "epoch": 0.6680910510313091, + "grad_norm": 1.2190937995910645, + "learning_rate": 0.00015541136605274423, + "loss": 0.804, + "step": 16260 + }, + { + "epoch": 0.6689128112416797, + "grad_norm": 1.2466769218444824, + "learning_rate": 0.00015530053906499306, + "loss": 0.8405, + "step": 16280 + }, + { + "epoch": 0.6697345714520503, + "grad_norm": 1.1477371454238892, + "learning_rate": 0.000155189614144244, + "loss": 0.8193, + "step": 16300 + }, + { + "epoch": 0.6705563316624209, + "grad_norm": 1.3594835996627808, + "learning_rate": 0.00015508414493801062, + "loss": 0.8351, + "step": 16320 + }, + { + "epoch": 0.6713780918727915, + "grad_norm": 1.2145652770996094, + "learning_rate": 0.00015497302961308234, + "loss": 0.8394, + "step": 16340 + }, + { + "epoch": 0.6721998520831621, + "grad_norm": 1.1144131422042847, + "learning_rate": 0.00015486181693515012, + "loss": 0.8403, + "step": 16360 + }, + { + "epoch": 0.6730216122935327, + "grad_norm": 1.1824201345443726, + "learning_rate": 0.0001547505071011631, + "loss": 0.7953, + "step": 16380 + }, + { + "epoch": 0.6738433725039034, + "grad_norm": 1.1716006994247437, + "learning_rate": 0.00015463910030824243, + "loss": 0.8262, + "step": 16400 + }, + { + "epoch": 0.674665132714274, + "grad_norm": 1.243807315826416, + "learning_rate": 0.00015452759675368093, + "loss": 0.8239, + "step": 16420 + }, + { + "epoch": 0.6754868929246446, + "grad_norm": 1.235845923423767, + "learning_rate": 0.00015441599663494287, + "loss": 0.8226, + "step": 16440 + }, + { + "epoch": 0.6763086531350152, + "grad_norm": 1.1748180389404297, + "learning_rate": 0.0001543043001496634, + "loss": 0.8161, + "step": 16460 + }, + { + "epoch": 0.6771304133453858, + "grad_norm": 1.1699196100234985, + "learning_rate": 0.00015419250749564841, + "loss": 0.8131, + "step": 16480 + }, + { + "epoch": 0.6779521735557564, + "grad_norm": 1.238051176071167, + "learning_rate": 0.00015408061887087416, + "loss": 0.8475, + "step": 16500 + }, + { + "epoch": 0.678773933766127, + "grad_norm": 1.3277729749679565, + "learning_rate": 0.0001539686344734867, + "loss": 0.8217, + "step": 16520 + }, + { + "epoch": 0.6795956939764977, + "grad_norm": 1.1437911987304688, + "learning_rate": 0.00015385655450180185, + "loss": 0.8305, + "step": 16540 + }, + { + "epoch": 0.6804174541868683, + "grad_norm": 1.2318732738494873, + "learning_rate": 0.00015374437915430456, + "loss": 0.8295, + "step": 16560 + }, + { + "epoch": 0.6812392143972389, + "grad_norm": 1.1004635095596313, + "learning_rate": 0.00015363210862964874, + "loss": 0.8028, + "step": 16580 + }, + { + "epoch": 0.6820609746076095, + "grad_norm": 0.9944893717765808, + "learning_rate": 0.00015351974312665685, + "loss": 0.8375, + "step": 16600 + }, + { + "epoch": 0.6828827348179801, + "grad_norm": 1.3072353601455688, + "learning_rate": 0.00015340728284431957, + "loss": 0.8113, + "step": 16620 + }, + { + "epoch": 0.6837044950283507, + "grad_norm": 1.173128604888916, + "learning_rate": 0.0001532947279817954, + "loss": 0.8208, + "step": 16640 + }, + { + "epoch": 0.6845262552387213, + "grad_norm": 1.1952176094055176, + "learning_rate": 0.0001531820787384103, + "loss": 0.8139, + "step": 16660 + }, + { + "epoch": 0.685348015449092, + "grad_norm": 1.189226508140564, + "learning_rate": 0.00015306933531365746, + "loss": 0.837, + "step": 16680 + }, + { + "epoch": 0.6861697756594626, + "grad_norm": 1.0701826810836792, + "learning_rate": 0.0001529564979071968, + "loss": 0.8424, + "step": 16700 + }, + { + "epoch": 0.6869915358698332, + "grad_norm": 1.2586934566497803, + "learning_rate": 0.00015284356671885465, + "loss": 0.81, + "step": 16720 + }, + { + "epoch": 0.6878132960802038, + "grad_norm": 1.2510169744491577, + "learning_rate": 0.00015273054194862344, + "loss": 0.8353, + "step": 16740 + }, + { + "epoch": 0.6886350562905744, + "grad_norm": 1.2519487142562866, + "learning_rate": 0.00015261742379666138, + "loss": 0.8164, + "step": 16760 + }, + { + "epoch": 0.689456816500945, + "grad_norm": 1.2734975814819336, + "learning_rate": 0.000152504212463292, + "loss": 0.8136, + "step": 16780 + }, + { + "epoch": 0.6902785767113157, + "grad_norm": 1.4471458196640015, + "learning_rate": 0.00015239090814900386, + "loss": 0.814, + "step": 16800 + }, + { + "epoch": 0.6911003369216863, + "grad_norm": 1.1526660919189453, + "learning_rate": 0.00015227751105445017, + "loss": 0.8106, + "step": 16820 + }, + { + "epoch": 0.6919220971320569, + "grad_norm": 1.2432206869125366, + "learning_rate": 0.00015216402138044843, + "loss": 0.7911, + "step": 16840 + }, + { + "epoch": 0.6927438573424275, + "grad_norm": 1.3356374502182007, + "learning_rate": 0.00015205043932798015, + "loss": 0.8157, + "step": 16860 + }, + { + "epoch": 0.6935656175527981, + "grad_norm": 1.105976939201355, + "learning_rate": 0.00015193676509819043, + "loss": 0.8088, + "step": 16880 + }, + { + "epoch": 0.6943873777631687, + "grad_norm": 1.2425799369812012, + "learning_rate": 0.0001518229988923875, + "loss": 0.8318, + "step": 16900 + }, + { + "epoch": 0.6952091379735393, + "grad_norm": 1.1508738994598389, + "learning_rate": 0.00015171483598759317, + "loss": 0.84, + "step": 16920 + }, + { + "epoch": 0.69603089818391, + "grad_norm": 1.237749695777893, + "learning_rate": 0.00015160089100819412, + "loss": 0.823, + "step": 16940 + }, + { + "epoch": 0.6968526583942806, + "grad_norm": 1.1718213558197021, + "learning_rate": 0.00015148685464758913, + "loss": 0.7944, + "step": 16960 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 1.207578420639038, + "learning_rate": 0.0001513727271077277, + "loss": 0.8068, + "step": 16980 + }, + { + "epoch": 0.6984961788150218, + "grad_norm": 1.180071473121643, + "learning_rate": 0.00015125850859072098, + "loss": 0.8302, + "step": 17000 + }, + { + "epoch": 0.6993179390253924, + "grad_norm": 1.2875052690505981, + "learning_rate": 0.00015114419929884116, + "loss": 0.8382, + "step": 17020 + }, + { + "epoch": 0.700139699235763, + "grad_norm": 1.3213404417037964, + "learning_rate": 0.0001510297994345212, + "loss": 0.8196, + "step": 17040 + }, + { + "epoch": 0.7009614594461336, + "grad_norm": 1.2812501192092896, + "learning_rate": 0.00015091530920035445, + "loss": 0.85, + "step": 17060 + }, + { + "epoch": 0.7017832196565043, + "grad_norm": 1.122597336769104, + "learning_rate": 0.0001508007287990943, + "loss": 0.8363, + "step": 17080 + }, + { + "epoch": 0.7026049798668749, + "grad_norm": 1.1828125715255737, + "learning_rate": 0.0001506860584336538, + "loss": 0.8217, + "step": 17100 + }, + { + "epoch": 0.7034267400772455, + "grad_norm": 1.187536358833313, + "learning_rate": 0.00015057129830710542, + "loss": 0.826, + "step": 17120 + }, + { + "epoch": 0.7042485002876161, + "grad_norm": 1.1405049562454224, + "learning_rate": 0.00015045644862268044, + "loss": 0.8035, + "step": 17140 + }, + { + "epoch": 0.7050702604979867, + "grad_norm": 1.255892276763916, + "learning_rate": 0.0001503415095837688, + "loss": 0.8251, + "step": 17160 + }, + { + "epoch": 0.7058920207083573, + "grad_norm": 1.1409162282943726, + "learning_rate": 0.00015022648139391875, + "loss": 0.8049, + "step": 17180 + }, + { + "epoch": 0.7067137809187279, + "grad_norm": 1.1684703826904297, + "learning_rate": 0.00015011136425683628, + "loss": 0.8166, + "step": 17200 + }, + { + "epoch": 0.7075355411290986, + "grad_norm": 1.2653696537017822, + "learning_rate": 0.00014999615837638506, + "loss": 0.779, + "step": 17220 + }, + { + "epoch": 0.7083573013394692, + "grad_norm": 1.2622226476669312, + "learning_rate": 0.0001498808639565858, + "loss": 0.8285, + "step": 17240 + }, + { + "epoch": 0.7091790615498398, + "grad_norm": 1.152935266494751, + "learning_rate": 0.00014976548120161607, + "loss": 0.7834, + "step": 17260 + }, + { + "epoch": 0.7100008217602104, + "grad_norm": 1.2912839651107788, + "learning_rate": 0.0001496500103158098, + "loss": 0.8015, + "step": 17280 + }, + { + "epoch": 0.710822581970581, + "grad_norm": 1.2247825860977173, + "learning_rate": 0.00014953445150365705, + "loss": 0.8121, + "step": 17300 + }, + { + "epoch": 0.7116443421809516, + "grad_norm": 1.2684624195098877, + "learning_rate": 0.00014941880496980358, + "loss": 0.8203, + "step": 17320 + }, + { + "epoch": 0.7124661023913222, + "grad_norm": 1.2209104299545288, + "learning_rate": 0.00014930307091905057, + "loss": 0.8254, + "step": 17340 + }, + { + "epoch": 0.7132878626016929, + "grad_norm": 1.3020516633987427, + "learning_rate": 0.000149187249556354, + "loss": 0.8342, + "step": 17360 + }, + { + "epoch": 0.7141096228120635, + "grad_norm": 1.1557644605636597, + "learning_rate": 0.00014907134108682466, + "loss": 0.8199, + "step": 17380 + }, + { + "epoch": 0.7149313830224341, + "grad_norm": 1.177384853363037, + "learning_rate": 0.00014895534571572754, + "loss": 0.8406, + "step": 17400 + }, + { + "epoch": 0.7157531432328047, + "grad_norm": 1.2047346830368042, + "learning_rate": 0.0001488392636484815, + "loss": 0.8091, + "step": 17420 + }, + { + "epoch": 0.7165749034431753, + "grad_norm": 1.171870231628418, + "learning_rate": 0.00014872309509065886, + "loss": 0.7942, + "step": 17440 + }, + { + "epoch": 0.7173966636535459, + "grad_norm": 1.1541531085968018, + "learning_rate": 0.00014860684024798536, + "loss": 0.7921, + "step": 17460 + }, + { + "epoch": 0.7182184238639165, + "grad_norm": 1.1826390027999878, + "learning_rate": 0.0001484904993263392, + "loss": 0.8068, + "step": 17480 + }, + { + "epoch": 0.7190401840742872, + "grad_norm": 1.1863442659378052, + "learning_rate": 0.0001483740725317513, + "loss": 0.8347, + "step": 17500 + }, + { + "epoch": 0.7198619442846578, + "grad_norm": 1.2402416467666626, + "learning_rate": 0.00014825756007040458, + "loss": 0.8276, + "step": 17520 + }, + { + "epoch": 0.7206837044950284, + "grad_norm": 1.2695879936218262, + "learning_rate": 0.00014814096214863355, + "loss": 0.8035, + "step": 17540 + }, + { + "epoch": 0.721505464705399, + "grad_norm": 1.1038098335266113, + "learning_rate": 0.0001480242789729242, + "loss": 0.8131, + "step": 17560 + }, + { + "epoch": 0.7223272249157696, + "grad_norm": 1.3015015125274658, + "learning_rate": 0.0001479075107499135, + "loss": 0.7688, + "step": 17580 + }, + { + "epoch": 0.7231489851261402, + "grad_norm": 1.1493345499038696, + "learning_rate": 0.00014779065768638888, + "loss": 0.8167, + "step": 17600 + }, + { + "epoch": 0.7239707453365108, + "grad_norm": 1.1382313966751099, + "learning_rate": 0.0001476737199892882, + "loss": 0.8435, + "step": 17620 + }, + { + "epoch": 0.7247925055468815, + "grad_norm": 1.2679277658462524, + "learning_rate": 0.0001475566978656991, + "loss": 0.8304, + "step": 17640 + }, + { + "epoch": 0.7256142657572521, + "grad_norm": 1.2502251863479614, + "learning_rate": 0.0001474395915228587, + "loss": 0.8252, + "step": 17660 + }, + { + "epoch": 0.7264360259676227, + "grad_norm": 1.1250088214874268, + "learning_rate": 0.00014732240116815343, + "loss": 0.8166, + "step": 17680 + }, + { + "epoch": 0.7272577861779933, + "grad_norm": 1.1370155811309814, + "learning_rate": 0.0001472051270091183, + "loss": 0.7896, + "step": 17700 + }, + { + "epoch": 0.7280795463883639, + "grad_norm": 1.2608290910720825, + "learning_rate": 0.00014708776925343684, + "loss": 0.8028, + "step": 17720 + }, + { + "epoch": 0.7289013065987345, + "grad_norm": 1.3024847507476807, + "learning_rate": 0.00014697032810894064, + "loss": 0.7981, + "step": 17740 + }, + { + "epoch": 0.7297230668091051, + "grad_norm": 1.1868743896484375, + "learning_rate": 0.00014685280378360884, + "loss": 0.7949, + "step": 17760 + }, + { + "epoch": 0.7305448270194758, + "grad_norm": 1.253355622291565, + "learning_rate": 0.00014673519648556805, + "loss": 0.8107, + "step": 17780 + }, + { + "epoch": 0.7313665872298464, + "grad_norm": 1.2940136194229126, + "learning_rate": 0.00014661750642309173, + "loss": 0.8017, + "step": 17800 + }, + { + "epoch": 0.732188347440217, + "grad_norm": 1.2385964393615723, + "learning_rate": 0.0001464997338045999, + "loss": 0.8055, + "step": 17820 + }, + { + "epoch": 0.7330101076505876, + "grad_norm": 1.2589930295944214, + "learning_rate": 0.0001463818788386588, + "loss": 0.7795, + "step": 17840 + }, + { + "epoch": 0.7338318678609582, + "grad_norm": 1.0926438570022583, + "learning_rate": 0.00014626394173398056, + "loss": 0.8202, + "step": 17860 + }, + { + "epoch": 0.7346536280713288, + "grad_norm": 1.1935014724731445, + "learning_rate": 0.00014614592269942262, + "loss": 0.7994, + "step": 17880 + }, + { + "epoch": 0.7354753882816994, + "grad_norm": 1.1052745580673218, + "learning_rate": 0.0001460278219439877, + "loss": 0.8283, + "step": 17900 + }, + { + "epoch": 0.7362971484920701, + "grad_norm": 1.2212902307510376, + "learning_rate": 0.00014590963967682304, + "loss": 0.7937, + "step": 17920 + }, + { + "epoch": 0.7371189087024407, + "grad_norm": 1.2168195247650146, + "learning_rate": 0.00014579137610722044, + "loss": 0.7938, + "step": 17940 + }, + { + "epoch": 0.7379406689128113, + "grad_norm": 1.1712826490402222, + "learning_rate": 0.00014567303144461552, + "loss": 0.8279, + "step": 17960 + }, + { + "epoch": 0.7387624291231819, + "grad_norm": 1.2737464904785156, + "learning_rate": 0.0001455546058985876, + "loss": 0.8144, + "step": 17980 + }, + { + "epoch": 0.7395841893335525, + "grad_norm": 1.2962466478347778, + "learning_rate": 0.0001454360996788592, + "loss": 0.8398, + "step": 18000 + }, + { + "epoch": 0.7395841893335525, + "eval_loss": 1.0920464992523193, + "eval_runtime": 16.6449, + "eval_samples_per_second": 157.406, + "eval_steps_per_second": 4.926, + "step": 18000 + }, + { + "epoch": 0.7404059495439231, + "grad_norm": 1.1804122924804688, + "learning_rate": 0.0001453175129952957, + "loss": 0.8203, + "step": 18020 + }, + { + "epoch": 0.7412277097542938, + "grad_norm": 1.0877560377120972, + "learning_rate": 0.00014519884605790497, + "loss": 0.7971, + "step": 18040 + }, + { + "epoch": 0.7420494699646644, + "grad_norm": 1.2490891218185425, + "learning_rate": 0.00014508009907683705, + "loss": 0.8479, + "step": 18060 + }, + { + "epoch": 0.742871230175035, + "grad_norm": 1.1328068971633911, + "learning_rate": 0.0001449612722623837, + "loss": 0.8062, + "step": 18080 + }, + { + "epoch": 0.7436929903854055, + "grad_norm": 1.2985849380493164, + "learning_rate": 0.00014484236582497806, + "loss": 0.8267, + "step": 18100 + }, + { + "epoch": 0.7445147505957761, + "grad_norm": 1.1852083206176758, + "learning_rate": 0.00014472337997519432, + "loss": 0.8416, + "step": 18120 + }, + { + "epoch": 0.7453365108061467, + "grad_norm": 1.0732208490371704, + "learning_rate": 0.0001446043149237472, + "loss": 0.7889, + "step": 18140 + }, + { + "epoch": 0.7461582710165173, + "grad_norm": 1.331568717956543, + "learning_rate": 0.00014448517088149176, + "loss": 0.7988, + "step": 18160 + }, + { + "epoch": 0.7469800312268879, + "grad_norm": 1.2419168949127197, + "learning_rate": 0.00014436594805942288, + "loss": 0.8073, + "step": 18180 + }, + { + "epoch": 0.7478017914372586, + "grad_norm": 1.2016359567642212, + "learning_rate": 0.0001442466466686751, + "loss": 0.8104, + "step": 18200 + }, + { + "epoch": 0.7486235516476292, + "grad_norm": 1.0315485000610352, + "learning_rate": 0.00014412726692052195, + "loss": 0.8303, + "step": 18220 + }, + { + "epoch": 0.7494453118579998, + "grad_norm": 1.1660343408584595, + "learning_rate": 0.00014400780902637574, + "loss": 0.8143, + "step": 18240 + }, + { + "epoch": 0.7502670720683704, + "grad_norm": 1.1398091316223145, + "learning_rate": 0.00014388827319778723, + "loss": 0.8144, + "step": 18260 + }, + { + "epoch": 0.751088832278741, + "grad_norm": 1.2301084995269775, + "learning_rate": 0.00014376865964644522, + "loss": 0.8116, + "step": 18280 + }, + { + "epoch": 0.7519105924891116, + "grad_norm": 1.159977674484253, + "learning_rate": 0.00014364896858417607, + "loss": 0.8183, + "step": 18300 + }, + { + "epoch": 0.7527323526994822, + "grad_norm": 1.3030027151107788, + "learning_rate": 0.0001435292002229434, + "loss": 0.8133, + "step": 18320 + }, + { + "epoch": 0.7535541129098529, + "grad_norm": 1.2161463499069214, + "learning_rate": 0.00014340935477484781, + "loss": 0.8049, + "step": 18340 + }, + { + "epoch": 0.7543758731202235, + "grad_norm": 1.2423648834228516, + "learning_rate": 0.0001432894324521263, + "loss": 0.8061, + "step": 18360 + }, + { + "epoch": 0.7551976333305941, + "grad_norm": 1.2223330736160278, + "learning_rate": 0.00014316943346715216, + "loss": 0.8098, + "step": 18380 + }, + { + "epoch": 0.7560193935409647, + "grad_norm": 1.2927947044372559, + "learning_rate": 0.00014304935803243429, + "loss": 0.7944, + "step": 18400 + }, + { + "epoch": 0.7568411537513353, + "grad_norm": 1.2604374885559082, + "learning_rate": 0.00014292920636061704, + "loss": 0.8082, + "step": 18420 + }, + { + "epoch": 0.7576629139617059, + "grad_norm": 1.2504435777664185, + "learning_rate": 0.00014280897866447985, + "loss": 0.8105, + "step": 18440 + }, + { + "epoch": 0.7584846741720765, + "grad_norm": 1.2936044931411743, + "learning_rate": 0.00014268867515693662, + "loss": 0.7844, + "step": 18460 + }, + { + "epoch": 0.7593064343824472, + "grad_norm": 1.1824711561203003, + "learning_rate": 0.00014256829605103564, + "loss": 0.8227, + "step": 18480 + }, + { + "epoch": 0.7601281945928178, + "grad_norm": 1.300419807434082, + "learning_rate": 0.00014244784155995906, + "loss": 0.8513, + "step": 18500 + }, + { + "epoch": 0.7609499548031884, + "grad_norm": 1.376930832862854, + "learning_rate": 0.00014232731189702249, + "loss": 0.8247, + "step": 18520 + }, + { + "epoch": 0.761771715013559, + "grad_norm": 1.229896903038025, + "learning_rate": 0.00014220670727567466, + "loss": 0.7918, + "step": 18540 + }, + { + "epoch": 0.7625934752239296, + "grad_norm": 1.1824287176132202, + "learning_rate": 0.00014208602790949715, + "loss": 0.8236, + "step": 18560 + }, + { + "epoch": 0.7634152354343002, + "grad_norm": 1.1316356658935547, + "learning_rate": 0.00014196527401220374, + "loss": 0.8122, + "step": 18580 + }, + { + "epoch": 0.7642369956446708, + "grad_norm": 1.1641656160354614, + "learning_rate": 0.00014184444579764036, + "loss": 0.7887, + "step": 18600 + }, + { + "epoch": 0.7650587558550415, + "grad_norm": 1.2354685068130493, + "learning_rate": 0.0001417235434797844, + "loss": 0.8223, + "step": 18620 + }, + { + "epoch": 0.7658805160654121, + "grad_norm": 1.108034372329712, + "learning_rate": 0.00014160256727274462, + "loss": 0.8055, + "step": 18640 + }, + { + "epoch": 0.7667022762757827, + "grad_norm": 1.3176486492156982, + "learning_rate": 0.00014148151739076055, + "loss": 0.829, + "step": 18660 + }, + { + "epoch": 0.7675240364861533, + "grad_norm": 1.0911662578582764, + "learning_rate": 0.0001413603940482022, + "loss": 0.7978, + "step": 18680 + }, + { + "epoch": 0.7683457966965239, + "grad_norm": 1.0955551862716675, + "learning_rate": 0.0001412391974595697, + "loss": 0.8174, + "step": 18700 + }, + { + "epoch": 0.7691675569068945, + "grad_norm": 1.1460391283035278, + "learning_rate": 0.0001411179278394929, + "loss": 0.8279, + "step": 18720 + }, + { + "epoch": 0.7699893171172651, + "grad_norm": 1.1242876052856445, + "learning_rate": 0.00014099658540273096, + "loss": 0.8248, + "step": 18740 + }, + { + "epoch": 0.7708110773276358, + "grad_norm": 1.1270663738250732, + "learning_rate": 0.00014087517036417196, + "loss": 0.8211, + "step": 18760 + }, + { + "epoch": 0.7716328375380064, + "grad_norm": 1.1673957109451294, + "learning_rate": 0.0001407536829388326, + "loss": 0.8119, + "step": 18780 + }, + { + "epoch": 0.772454597748377, + "grad_norm": 1.123095989227295, + "learning_rate": 0.00014063212334185774, + "loss": 0.8095, + "step": 18800 + }, + { + "epoch": 0.7732763579587476, + "grad_norm": 1.0859897136688232, + "learning_rate": 0.0001405104917885201, + "loss": 0.7797, + "step": 18820 + }, + { + "epoch": 0.7740981181691182, + "grad_norm": 1.2377300262451172, + "learning_rate": 0.0001403887884942198, + "loss": 0.8273, + "step": 18840 + }, + { + "epoch": 0.7749198783794888, + "grad_norm": 1.2705551385879517, + "learning_rate": 0.000140267013674484, + "loss": 0.8051, + "step": 18860 + }, + { + "epoch": 0.7757416385898595, + "grad_norm": 1.319014072418213, + "learning_rate": 0.00014014516754496656, + "loss": 0.8107, + "step": 18880 + }, + { + "epoch": 0.7765633988002301, + "grad_norm": 1.3769927024841309, + "learning_rate": 0.00014002325032144754, + "loss": 0.7844, + "step": 18900 + }, + { + "epoch": 0.7773851590106007, + "grad_norm": 1.3221933841705322, + "learning_rate": 0.00013990126221983298, + "loss": 0.8131, + "step": 18920 + }, + { + "epoch": 0.7782069192209713, + "grad_norm": 1.2304918766021729, + "learning_rate": 0.00013977920345615444, + "loss": 0.8039, + "step": 18940 + }, + { + "epoch": 0.7790286794313419, + "grad_norm": 1.385318398475647, + "learning_rate": 0.00013965707424656856, + "loss": 0.7985, + "step": 18960 + }, + { + "epoch": 0.7798504396417125, + "grad_norm": 1.1849255561828613, + "learning_rate": 0.00013953487480735679, + "loss": 0.7902, + "step": 18980 + }, + { + "epoch": 0.7806721998520831, + "grad_norm": 1.2645635604858398, + "learning_rate": 0.0001394126053549249, + "loss": 0.7931, + "step": 19000 + }, + { + "epoch": 0.7814939600624538, + "grad_norm": 1.2201372385025024, + "learning_rate": 0.00013929026610580276, + "loss": 0.8244, + "step": 19020 + }, + { + "epoch": 0.7823157202728244, + "grad_norm": 1.1765888929367065, + "learning_rate": 0.0001391739793672815, + "loss": 0.8012, + "step": 19040 + }, + { + "epoch": 0.783137480483195, + "grad_norm": 1.0896601676940918, + "learning_rate": 0.0001390515046378748, + "loss": 0.8107, + "step": 19060 + }, + { + "epoch": 0.7839592406935656, + "grad_norm": 1.125887393951416, + "learning_rate": 0.00013892896075125936, + "loss": 0.8233, + "step": 19080 + }, + { + "epoch": 0.7847810009039362, + "grad_norm": 1.1168248653411865, + "learning_rate": 0.00013880634792445097, + "loss": 0.8036, + "step": 19100 + }, + { + "epoch": 0.7856027611143068, + "grad_norm": 1.3335416316986084, + "learning_rate": 0.00013868366637458755, + "loss": 0.8174, + "step": 19120 + }, + { + "epoch": 0.7864245213246774, + "grad_norm": 1.281420111656189, + "learning_rate": 0.0001385609163189286, + "loss": 0.8116, + "step": 19140 + }, + { + "epoch": 0.787246281535048, + "grad_norm": 1.2268955707550049, + "learning_rate": 0.00013843809797485515, + "loss": 0.8147, + "step": 19160 + }, + { + "epoch": 0.7880680417454187, + "grad_norm": 1.2544898986816406, + "learning_rate": 0.0001383152115598689, + "loss": 0.826, + "step": 19180 + }, + { + "epoch": 0.7888898019557893, + "grad_norm": 1.3066948652267456, + "learning_rate": 0.00013819225729159228, + "loss": 0.8072, + "step": 19200 + }, + { + "epoch": 0.7897115621661599, + "grad_norm": 1.2460455894470215, + "learning_rate": 0.00013806923538776787, + "loss": 0.7989, + "step": 19220 + }, + { + "epoch": 0.7905333223765305, + "grad_norm": 1.155281662940979, + "learning_rate": 0.00013794614606625792, + "loss": 0.8405, + "step": 19240 + }, + { + "epoch": 0.7913550825869011, + "grad_norm": 1.054686427116394, + "learning_rate": 0.00013782914896373142, + "loss": 0.8056, + "step": 19260 + }, + { + "epoch": 0.7921768427972717, + "grad_norm": 1.3717306852340698, + "learning_rate": 0.00013770592880481307, + "loss": 0.802, + "step": 19280 + }, + { + "epoch": 0.7929986030076424, + "grad_norm": 1.2226320505142212, + "learning_rate": 0.0001375826418715972, + "loss": 0.813, + "step": 19300 + }, + { + "epoch": 0.793820363218013, + "grad_norm": 1.250246524810791, + "learning_rate": 0.00013745928838241556, + "loss": 0.8225, + "step": 19320 + }, + { + "epoch": 0.7946421234283836, + "grad_norm": 1.0490273237228394, + "learning_rate": 0.00013733586855571754, + "loss": 0.7996, + "step": 19340 + }, + { + "epoch": 0.7954638836387542, + "grad_norm": 1.1633754968643188, + "learning_rate": 0.00013721238261007023, + "loss": 0.8224, + "step": 19360 + }, + { + "epoch": 0.7962856438491248, + "grad_norm": 1.079055666923523, + "learning_rate": 0.0001370888307641578, + "loss": 0.8207, + "step": 19380 + }, + { + "epoch": 0.7971074040594954, + "grad_norm": 1.3664897680282593, + "learning_rate": 0.00013696521323678092, + "loss": 0.8015, + "step": 19400 + }, + { + "epoch": 0.797929164269866, + "grad_norm": 1.1427098512649536, + "learning_rate": 0.00013684153024685685, + "loss": 0.8093, + "step": 19420 + }, + { + "epoch": 0.7987509244802367, + "grad_norm": 1.1648412942886353, + "learning_rate": 0.00013671778201341855, + "loss": 0.7898, + "step": 19440 + }, + { + "epoch": 0.7995726846906073, + "grad_norm": 1.1498785018920898, + "learning_rate": 0.00013659396875561467, + "loss": 0.8113, + "step": 19460 + }, + { + "epoch": 0.8003944449009779, + "grad_norm": 1.1189064979553223, + "learning_rate": 0.00013647009069270893, + "loss": 0.8062, + "step": 19480 + }, + { + "epoch": 0.8012162051113485, + "grad_norm": 1.225690245628357, + "learning_rate": 0.00013634614804407984, + "loss": 0.7994, + "step": 19500 + }, + { + "epoch": 0.8020379653217191, + "grad_norm": 1.1157780885696411, + "learning_rate": 0.0001362221410292203, + "loss": 0.7882, + "step": 19520 + }, + { + "epoch": 0.8028597255320897, + "grad_norm": 1.0617575645446777, + "learning_rate": 0.00013609806986773722, + "loss": 0.8035, + "step": 19540 + }, + { + "epoch": 0.8036814857424603, + "grad_norm": 1.128070592880249, + "learning_rate": 0.00013597393477935102, + "loss": 0.8095, + "step": 19560 + }, + { + "epoch": 0.804503245952831, + "grad_norm": 1.1231319904327393, + "learning_rate": 0.00013584973598389544, + "loss": 0.799, + "step": 19580 + }, + { + "epoch": 0.8053250061632016, + "grad_norm": 1.4038573503494263, + "learning_rate": 0.00013572547370131695, + "loss": 0.7849, + "step": 19600 + }, + { + "epoch": 0.8061467663735722, + "grad_norm": 1.208066463470459, + "learning_rate": 0.00013560114815167447, + "loss": 0.8127, + "step": 19620 + }, + { + "epoch": 0.8069685265839428, + "grad_norm": 1.243054747581482, + "learning_rate": 0.00013547675955513904, + "loss": 0.8139, + "step": 19640 + }, + { + "epoch": 0.8077902867943134, + "grad_norm": 1.1254897117614746, + "learning_rate": 0.00013535230813199324, + "loss": 0.7909, + "step": 19660 + }, + { + "epoch": 0.808612047004684, + "grad_norm": 1.3242278099060059, + "learning_rate": 0.00013522779410263098, + "loss": 0.8242, + "step": 19680 + }, + { + "epoch": 0.8094338072150546, + "grad_norm": 1.042738437652588, + "learning_rate": 0.00013510321768755702, + "loss": 0.7948, + "step": 19700 + }, + { + "epoch": 0.8102555674254253, + "grad_norm": 1.1268333196640015, + "learning_rate": 0.00013497857910738662, + "loss": 0.7923, + "step": 19720 + }, + { + "epoch": 0.8110773276357959, + "grad_norm": 1.115422010421753, + "learning_rate": 0.00013485387858284506, + "loss": 0.7998, + "step": 19740 + }, + { + "epoch": 0.8118990878461665, + "grad_norm": 1.2183444499969482, + "learning_rate": 0.0001347291163347674, + "loss": 0.8054, + "step": 19760 + }, + { + "epoch": 0.8127208480565371, + "grad_norm": 1.158079981803894, + "learning_rate": 0.000134604292584098, + "loss": 0.7716, + "step": 19780 + }, + { + "epoch": 0.8135426082669077, + "grad_norm": 1.0598976612091064, + "learning_rate": 0.0001344794075518901, + "loss": 0.7967, + "step": 19800 + }, + { + "epoch": 0.8143643684772783, + "grad_norm": 1.1043282747268677, + "learning_rate": 0.00013435446145930544, + "loss": 0.7848, + "step": 19820 + }, + { + "epoch": 0.815186128687649, + "grad_norm": 1.2107045650482178, + "learning_rate": 0.00013422945452761398, + "loss": 0.7996, + "step": 19840 + }, + { + "epoch": 0.8160078888980196, + "grad_norm": 1.1288707256317139, + "learning_rate": 0.00013410438697819337, + "loss": 0.8172, + "step": 19860 + }, + { + "epoch": 0.8168296491083902, + "grad_norm": 1.215134859085083, + "learning_rate": 0.0001339792590325286, + "loss": 0.8155, + "step": 19880 + }, + { + "epoch": 0.8176514093187608, + "grad_norm": 1.3704839944839478, + "learning_rate": 0.00013385407091221163, + "loss": 0.7831, + "step": 19900 + }, + { + "epoch": 0.8184731695291314, + "grad_norm": 1.2613569498062134, + "learning_rate": 0.000133728822838941, + "loss": 0.8154, + "step": 19920 + }, + { + "epoch": 0.819294929739502, + "grad_norm": 1.2519367933273315, + "learning_rate": 0.00013360351503452137, + "loss": 0.8089, + "step": 19940 + }, + { + "epoch": 0.8201166899498726, + "grad_norm": 1.259395956993103, + "learning_rate": 0.0001334781477208632, + "loss": 0.8161, + "step": 19960 + }, + { + "epoch": 0.8209384501602432, + "grad_norm": 1.1715264320373535, + "learning_rate": 0.00013335272111998246, + "loss": 0.8009, + "step": 19980 + }, + { + "epoch": 0.8217602103706139, + "grad_norm": 1.1847702264785767, + "learning_rate": 0.0001332272354539999, + "loss": 0.774, + "step": 20000 + }, + { + "epoch": 0.8217602103706139, + "eval_loss": 1.0668244361877441, + "eval_runtime": 16.5724, + "eval_samples_per_second": 158.094, + "eval_steps_per_second": 4.948, + "step": 20000 + }, + { + "epoch": 0.8225819705809845, + "grad_norm": 1.224970817565918, + "learning_rate": 0.00013310169094514103, + "loss": 0.8, + "step": 20020 + }, + { + "epoch": 0.8234037307913551, + "grad_norm": 1.1203746795654297, + "learning_rate": 0.0001329760878157355, + "loss": 0.8158, + "step": 20040 + }, + { + "epoch": 0.8242254910017257, + "grad_norm": 1.1533716917037964, + "learning_rate": 0.00013285042628821675, + "loss": 0.7896, + "step": 20060 + }, + { + "epoch": 0.8250472512120963, + "grad_norm": 1.213652491569519, + "learning_rate": 0.00013272470658512174, + "loss": 0.8121, + "step": 20080 + }, + { + "epoch": 0.8258690114224669, + "grad_norm": 1.2149641513824463, + "learning_rate": 0.00013259892892909033, + "loss": 0.7926, + "step": 20100 + }, + { + "epoch": 0.8266907716328376, + "grad_norm": 1.3824020624160767, + "learning_rate": 0.00013247309354286503, + "loss": 0.7909, + "step": 20120 + }, + { + "epoch": 0.8275125318432082, + "grad_norm": 1.0057183504104614, + "learning_rate": 0.00013234720064929075, + "loss": 0.819, + "step": 20140 + }, + { + "epoch": 0.8283342920535788, + "grad_norm": 1.1706757545471191, + "learning_rate": 0.000132221250471314, + "loss": 0.8248, + "step": 20160 + }, + { + "epoch": 0.8291560522639494, + "grad_norm": 1.0104855298995972, + "learning_rate": 0.00013209524323198294, + "loss": 0.7992, + "step": 20180 + }, + { + "epoch": 0.82997781247432, + "grad_norm": 1.2821381092071533, + "learning_rate": 0.0001319691791544466, + "loss": 0.802, + "step": 20200 + }, + { + "epoch": 0.8307995726846906, + "grad_norm": 1.106156826019287, + "learning_rate": 0.0001318430584619548, + "loss": 0.7799, + "step": 20220 + }, + { + "epoch": 0.8316213328950612, + "grad_norm": 1.1786506175994873, + "learning_rate": 0.0001317168813778576, + "loss": 0.7927, + "step": 20240 + }, + { + "epoch": 0.8324430931054319, + "grad_norm": 1.280644178390503, + "learning_rate": 0.00013159064812560484, + "loss": 0.7922, + "step": 20260 + }, + { + "epoch": 0.8332648533158025, + "grad_norm": 1.2000395059585571, + "learning_rate": 0.00013146435892874596, + "loss": 0.8024, + "step": 20280 + }, + { + "epoch": 0.8340866135261731, + "grad_norm": 1.384141206741333, + "learning_rate": 0.00013134433257673927, + "loss": 0.8428, + "step": 20300 + }, + { + "epoch": 0.8349083737365437, + "grad_norm": 1.1807432174682617, + "learning_rate": 0.00013121793493125713, + "loss": 0.8039, + "step": 20320 + }, + { + "epoch": 0.8357301339469143, + "grad_norm": 1.1562411785125732, + "learning_rate": 0.00013109148200121524, + "loss": 0.8043, + "step": 20340 + }, + { + "epoch": 0.8365518941572849, + "grad_norm": 1.0757108926773071, + "learning_rate": 0.00013096497401055197, + "loss": 0.7933, + "step": 20360 + }, + { + "epoch": 0.8373736543676555, + "grad_norm": 1.1456866264343262, + "learning_rate": 0.00013083841118330326, + "loss": 0.7793, + "step": 20380 + }, + { + "epoch": 0.8381954145780262, + "grad_norm": 1.3488410711288452, + "learning_rate": 0.0001307117937436021, + "loss": 0.7775, + "step": 20400 + }, + { + "epoch": 0.8390171747883968, + "grad_norm": 1.2938168048858643, + "learning_rate": 0.0001305851219156783, + "loss": 0.8144, + "step": 20420 + }, + { + "epoch": 0.8398389349987674, + "grad_norm": 1.131605625152588, + "learning_rate": 0.00013045839592385787, + "loss": 0.7883, + "step": 20440 + }, + { + "epoch": 0.840660695209138, + "grad_norm": 1.1760270595550537, + "learning_rate": 0.00013033161599256275, + "loss": 0.7984, + "step": 20460 + }, + { + "epoch": 0.8414824554195086, + "grad_norm": 1.1592392921447754, + "learning_rate": 0.00013020478234631049, + "loss": 0.8142, + "step": 20480 + }, + { + "epoch": 0.8423042156298792, + "grad_norm": 1.3598871231079102, + "learning_rate": 0.00013007789520971374, + "loss": 0.7861, + "step": 20500 + }, + { + "epoch": 0.8431259758402498, + "grad_norm": 1.2098135948181152, + "learning_rate": 0.00012995095480747976, + "loss": 0.8069, + "step": 20520 + }, + { + "epoch": 0.8439477360506205, + "grad_norm": 1.0988825559616089, + "learning_rate": 0.00012982396136441033, + "loss": 0.7971, + "step": 20540 + }, + { + "epoch": 0.8447694962609911, + "grad_norm": 0.9486870765686035, + "learning_rate": 0.000129696915105401, + "loss": 0.7807, + "step": 20560 + }, + { + "epoch": 0.8455912564713617, + "grad_norm": 1.1476234197616577, + "learning_rate": 0.00012956981625544093, + "loss": 0.7884, + "step": 20580 + }, + { + "epoch": 0.8464130166817323, + "grad_norm": 1.1225764751434326, + "learning_rate": 0.00012944266503961242, + "loss": 0.7966, + "step": 20600 + }, + { + "epoch": 0.8472347768921029, + "grad_norm": 1.316215991973877, + "learning_rate": 0.00012931546168309046, + "loss": 0.8115, + "step": 20620 + }, + { + "epoch": 0.8480565371024735, + "grad_norm": 1.1493761539459229, + "learning_rate": 0.0001291882064111424, + "loss": 0.8094, + "step": 20640 + }, + { + "epoch": 0.8488782973128441, + "grad_norm": 1.1531728506088257, + "learning_rate": 0.00012906089944912755, + "loss": 0.8158, + "step": 20660 + }, + { + "epoch": 0.8497000575232148, + "grad_norm": 1.1480746269226074, + "learning_rate": 0.00012893354102249673, + "loss": 0.8085, + "step": 20680 + }, + { + "epoch": 0.8505218177335854, + "grad_norm": 1.265832543373108, + "learning_rate": 0.00012880613135679193, + "loss": 0.803, + "step": 20700 + }, + { + "epoch": 0.851343577943956, + "grad_norm": 1.0262377262115479, + "learning_rate": 0.00012867867067764584, + "loss": 0.774, + "step": 20720 + }, + { + "epoch": 0.8521653381543266, + "grad_norm": 1.136478066444397, + "learning_rate": 0.00012855115921078152, + "loss": 0.779, + "step": 20740 + }, + { + "epoch": 0.8529870983646972, + "grad_norm": 1.1349806785583496, + "learning_rate": 0.00012842359718201194, + "loss": 0.7922, + "step": 20760 + }, + { + "epoch": 0.8538088585750678, + "grad_norm": 1.2569667100906372, + "learning_rate": 0.00012829598481723964, + "loss": 0.7739, + "step": 20780 + }, + { + "epoch": 0.8546306187854384, + "grad_norm": 1.2146037817001343, + "learning_rate": 0.00012816832234245634, + "loss": 0.8221, + "step": 20800 + }, + { + "epoch": 0.8554523789958091, + "grad_norm": 1.300690770149231, + "learning_rate": 0.00012804060998374245, + "loss": 0.7928, + "step": 20820 + }, + { + "epoch": 0.8562741392061797, + "grad_norm": 1.0861823558807373, + "learning_rate": 0.00012791284796726663, + "loss": 0.8043, + "step": 20840 + }, + { + "epoch": 0.8570958994165503, + "grad_norm": 1.2741750478744507, + "learning_rate": 0.0001277850365192857, + "loss": 0.8065, + "step": 20860 + }, + { + "epoch": 0.8579176596269209, + "grad_norm": 1.0767971277236938, + "learning_rate": 0.00012765717586614382, + "loss": 0.8153, + "step": 20880 + }, + { + "epoch": 0.8587394198372915, + "grad_norm": 1.1135772466659546, + "learning_rate": 0.0001275292662342724, + "loss": 0.7685, + "step": 20900 + }, + { + "epoch": 0.8595611800476621, + "grad_norm": 1.2278940677642822, + "learning_rate": 0.0001274013078501895, + "loss": 0.7635, + "step": 20920 + }, + { + "epoch": 0.8603829402580327, + "grad_norm": 1.0262848138809204, + "learning_rate": 0.00012727330094049967, + "loss": 0.7907, + "step": 20940 + }, + { + "epoch": 0.8612047004684034, + "grad_norm": 1.2303491830825806, + "learning_rate": 0.0001271452457318932, + "loss": 0.8026, + "step": 20960 + }, + { + "epoch": 0.862026460678774, + "grad_norm": 1.1088433265686035, + "learning_rate": 0.00012701714245114603, + "loss": 0.7966, + "step": 20980 + }, + { + "epoch": 0.8628482208891446, + "grad_norm": 1.1193281412124634, + "learning_rate": 0.00012688899132511924, + "loss": 0.7924, + "step": 21000 + }, + { + "epoch": 0.8636699810995152, + "grad_norm": 1.308245301246643, + "learning_rate": 0.00012676079258075858, + "loss": 0.7885, + "step": 21020 + }, + { + "epoch": 0.8644917413098858, + "grad_norm": 1.2952789068222046, + "learning_rate": 0.0001266325464450942, + "loss": 0.7913, + "step": 21040 + }, + { + "epoch": 0.8653135015202564, + "grad_norm": 1.1136795282363892, + "learning_rate": 0.00012650425314524006, + "loss": 0.7927, + "step": 21060 + }, + { + "epoch": 0.866135261730627, + "grad_norm": 1.222264051437378, + "learning_rate": 0.00012637591290839376, + "loss": 0.8171, + "step": 21080 + }, + { + "epoch": 0.8669570219409977, + "grad_norm": 1.3129292726516724, + "learning_rate": 0.000126247525961836, + "loss": 0.786, + "step": 21100 + }, + { + "epoch": 0.8677787821513683, + "grad_norm": 1.15342116355896, + "learning_rate": 0.00012611909253293016, + "loss": 0.7821, + "step": 21120 + }, + { + "epoch": 0.8686005423617389, + "grad_norm": 1.2096205949783325, + "learning_rate": 0.00012599061284912193, + "loss": 0.7886, + "step": 21140 + }, + { + "epoch": 0.8694223025721095, + "grad_norm": 1.0738475322723389, + "learning_rate": 0.00012586208713793898, + "loss": 0.7813, + "step": 21160 + }, + { + "epoch": 0.8702440627824801, + "grad_norm": 1.1030628681182861, + "learning_rate": 0.00012573351562699048, + "loss": 0.7859, + "step": 21180 + }, + { + "epoch": 0.8710658229928507, + "grad_norm": 1.0959700345993042, + "learning_rate": 0.0001256048985439666, + "loss": 0.8055, + "step": 21200 + }, + { + "epoch": 0.8718875832032214, + "grad_norm": 1.2536059617996216, + "learning_rate": 0.00012547623611663836, + "loss": 0.7837, + "step": 21220 + }, + { + "epoch": 0.872709343413592, + "grad_norm": 1.2621644735336304, + "learning_rate": 0.00012534752857285692, + "loss": 0.7977, + "step": 21240 + }, + { + "epoch": 0.8735311036239626, + "grad_norm": 1.1967788934707642, + "learning_rate": 0.00012521877614055357, + "loss": 0.8035, + "step": 21260 + }, + { + "epoch": 0.8743528638343332, + "grad_norm": 1.2444006204605103, + "learning_rate": 0.00012508997904773884, + "loss": 0.771, + "step": 21280 + }, + { + "epoch": 0.8751746240447038, + "grad_norm": 1.4090861082077026, + "learning_rate": 0.00012497402366798297, + "loss": 0.7657, + "step": 21300 + }, + { + "epoch": 0.8759963842550744, + "grad_norm": 1.2610723972320557, + "learning_rate": 0.00012484514234864898, + "loss": 0.7928, + "step": 21320 + }, + { + "epoch": 0.876818144465445, + "grad_norm": 1.2457926273345947, + "learning_rate": 0.0001247226643378368, + "loss": 0.7985, + "step": 21340 + }, + { + "epoch": 0.8776399046758157, + "grad_norm": 1.2153024673461914, + "learning_rate": 0.0001245936974322525, + "loss": 0.8184, + "step": 21360 + }, + { + "epoch": 0.8784616648861862, + "grad_norm": 1.244173526763916, + "learning_rate": 0.00012446468697312316, + "loss": 0.7855, + "step": 21380 + }, + { + "epoch": 0.8792834250965568, + "grad_norm": 1.2349773645401, + "learning_rate": 0.00012433563318891633, + "loss": 0.7915, + "step": 21400 + }, + { + "epoch": 0.8801051853069274, + "grad_norm": 1.3552359342575073, + "learning_rate": 0.0001242065363081764, + "loss": 0.7878, + "step": 21420 + }, + { + "epoch": 0.880926945517298, + "grad_norm": 1.071760654449463, + "learning_rate": 0.00012407739655952393, + "loss": 0.7836, + "step": 21440 + }, + { + "epoch": 0.8817487057276686, + "grad_norm": 1.2500207424163818, + "learning_rate": 0.0001239482141716555, + "loss": 0.7781, + "step": 21460 + }, + { + "epoch": 0.8825704659380392, + "grad_norm": 1.2251540422439575, + "learning_rate": 0.00012381898937334322, + "loss": 0.7826, + "step": 21480 + }, + { + "epoch": 0.8833922261484098, + "grad_norm": 1.0519969463348389, + "learning_rate": 0.0001236897223934342, + "loss": 0.7764, + "step": 21500 + }, + { + "epoch": 0.8842139863587805, + "grad_norm": 0.9632487297058105, + "learning_rate": 0.00012356041346085032, + "loss": 0.8034, + "step": 21520 + }, + { + "epoch": 0.8850357465691511, + "grad_norm": 1.2027411460876465, + "learning_rate": 0.00012343106280458777, + "loss": 0.8176, + "step": 21540 + }, + { + "epoch": 0.8858575067795217, + "grad_norm": 1.1366872787475586, + "learning_rate": 0.00012330814124322003, + "loss": 0.7884, + "step": 21560 + }, + { + "epoch": 0.8866792669898923, + "grad_norm": 1.137412667274475, + "learning_rate": 0.00012317870988471372, + "loss": 0.7777, + "step": 21580 + }, + { + "epoch": 0.8875010272002629, + "grad_norm": 1.1433818340301514, + "learning_rate": 0.00012304923747849642, + "loss": 0.7818, + "step": 21600 + }, + { + "epoch": 0.8883227874106335, + "grad_norm": 1.1305466890335083, + "learning_rate": 0.00012291972425385373, + "loss": 0.7829, + "step": 21620 + }, + { + "epoch": 0.8891445476210041, + "grad_norm": 1.1296701431274414, + "learning_rate": 0.00012279017044014364, + "loss": 0.8018, + "step": 21640 + }, + { + "epoch": 0.8899663078313748, + "grad_norm": 1.1467466354370117, + "learning_rate": 0.00012266057626679594, + "loss": 0.8009, + "step": 21660 + }, + { + "epoch": 0.8907880680417454, + "grad_norm": 1.2313017845153809, + "learning_rate": 0.00012253094196331202, + "loss": 0.7924, + "step": 21680 + }, + { + "epoch": 0.891609828252116, + "grad_norm": 1.1884132623672485, + "learning_rate": 0.00012240126775926418, + "loss": 0.8, + "step": 21700 + }, + { + "epoch": 0.8924315884624866, + "grad_norm": 1.1451033353805542, + "learning_rate": 0.00012227155388429547, + "loss": 0.8158, + "step": 21720 + }, + { + "epoch": 0.8932533486728572, + "grad_norm": 1.2778671979904175, + "learning_rate": 0.00012214180056811916, + "loss": 0.773, + "step": 21740 + }, + { + "epoch": 0.8940751088832278, + "grad_norm": 1.0006611347198486, + "learning_rate": 0.00012201200804051842, + "loss": 0.7859, + "step": 21760 + }, + { + "epoch": 0.8948968690935984, + "grad_norm": 1.270612120628357, + "learning_rate": 0.0001218821765313458, + "loss": 0.7977, + "step": 21780 + }, + { + "epoch": 0.8957186293039691, + "grad_norm": 1.4604487419128418, + "learning_rate": 0.00012175230627052294, + "loss": 0.8046, + "step": 21800 + }, + { + "epoch": 0.8965403895143397, + "grad_norm": 1.153067946434021, + "learning_rate": 0.00012162239748804005, + "loss": 0.7672, + "step": 21820 + }, + { + "epoch": 0.8973621497247103, + "grad_norm": 1.1451218128204346, + "learning_rate": 0.00012149245041395559, + "loss": 0.7758, + "step": 21840 + }, + { + "epoch": 0.8981839099350809, + "grad_norm": 1.2750080823898315, + "learning_rate": 0.00012136246527839583, + "loss": 0.7816, + "step": 21860 + }, + { + "epoch": 0.8990056701454515, + "grad_norm": 1.0611047744750977, + "learning_rate": 0.00012123244231155442, + "loss": 0.7668, + "step": 21880 + }, + { + "epoch": 0.8998274303558221, + "grad_norm": 1.098158359527588, + "learning_rate": 0.00012110238174369202, + "loss": 0.8227, + "step": 21900 + }, + { + "epoch": 0.9006491905661927, + "grad_norm": 1.0489616394042969, + "learning_rate": 0.00012097228380513595, + "loss": 0.8008, + "step": 21920 + }, + { + "epoch": 0.9014709507765634, + "grad_norm": 1.1194531917572021, + "learning_rate": 0.00012084214872627955, + "loss": 0.8118, + "step": 21940 + }, + { + "epoch": 0.902292710986934, + "grad_norm": 1.1527105569839478, + "learning_rate": 0.00012071197673758212, + "loss": 0.769, + "step": 21960 + }, + { + "epoch": 0.9031144711973046, + "grad_norm": 1.1321651935577393, + "learning_rate": 0.00012058176806956818, + "loss": 0.7789, + "step": 21980 + }, + { + "epoch": 0.9039362314076752, + "grad_norm": 1.25917387008667, + "learning_rate": 0.00012045152295282727, + "loss": 0.7828, + "step": 22000 + }, + { + "epoch": 0.9039362314076752, + "eval_loss": 1.0472674369812012, + "eval_runtime": 23.5948, + "eval_samples_per_second": 111.042, + "eval_steps_per_second": 3.475, + "step": 22000 + }, + { + "epoch": 0.9047579916180458, + "grad_norm": 1.260465145111084, + "learning_rate": 0.00012032124161801345, + "loss": 0.761, + "step": 22020 + }, + { + "epoch": 0.9055797518284164, + "grad_norm": 1.0791617631912231, + "learning_rate": 0.00012019092429584495, + "loss": 0.7737, + "step": 22040 + }, + { + "epoch": 0.906401512038787, + "grad_norm": 1.277144193649292, + "learning_rate": 0.0001200605712171037, + "loss": 0.8014, + "step": 22060 + }, + { + "epoch": 0.9072232722491577, + "grad_norm": 1.1210474967956543, + "learning_rate": 0.00011993018261263503, + "loss": 0.8066, + "step": 22080 + }, + { + "epoch": 0.9080450324595283, + "grad_norm": 1.2601996660232544, + "learning_rate": 0.00011979975871334703, + "loss": 0.7787, + "step": 22100 + }, + { + "epoch": 0.9088667926698989, + "grad_norm": 1.1547856330871582, + "learning_rate": 0.00011966929975021046, + "loss": 0.7936, + "step": 22120 + }, + { + "epoch": 0.9096885528802695, + "grad_norm": 1.1628038883209229, + "learning_rate": 0.00011953880595425808, + "loss": 0.7803, + "step": 22140 + }, + { + "epoch": 0.9105103130906401, + "grad_norm": 1.1185588836669922, + "learning_rate": 0.00011940827755658433, + "loss": 0.8004, + "step": 22160 + }, + { + "epoch": 0.9113320733010107, + "grad_norm": 1.0631072521209717, + "learning_rate": 0.00011927771478834496, + "loss": 0.7898, + "step": 22180 + }, + { + "epoch": 0.9121538335113814, + "grad_norm": 1.2936205863952637, + "learning_rate": 0.00011914711788075663, + "loss": 0.7809, + "step": 22200 + }, + { + "epoch": 0.912975593721752, + "grad_norm": 1.084246039390564, + "learning_rate": 0.00011901648706509636, + "loss": 0.795, + "step": 22220 + }, + { + "epoch": 0.9137973539321226, + "grad_norm": 1.3070451021194458, + "learning_rate": 0.00011888582257270132, + "loss": 0.793, + "step": 22240 + }, + { + "epoch": 0.9146191141424932, + "grad_norm": 1.2026519775390625, + "learning_rate": 0.00011875512463496822, + "loss": 0.8013, + "step": 22260 + }, + { + "epoch": 0.9154408743528638, + "grad_norm": 1.2209608554840088, + "learning_rate": 0.00011862439348335306, + "loss": 0.7708, + "step": 22280 + }, + { + "epoch": 0.9162626345632344, + "grad_norm": 1.1526132822036743, + "learning_rate": 0.00011849362934937065, + "loss": 0.7822, + "step": 22300 + }, + { + "epoch": 0.917084394773605, + "grad_norm": 1.0761892795562744, + "learning_rate": 0.00011836283246459421, + "loss": 0.8061, + "step": 22320 + }, + { + "epoch": 0.9179061549839757, + "grad_norm": 1.2090603113174438, + "learning_rate": 0.00011823200306065494, + "loss": 0.7692, + "step": 22340 + }, + { + "epoch": 0.9187279151943463, + "grad_norm": 1.1188862323760986, + "learning_rate": 0.00011810114136924166, + "loss": 0.8228, + "step": 22360 + }, + { + "epoch": 0.9195496754047169, + "grad_norm": 1.138457179069519, + "learning_rate": 0.00011797024762210034, + "loss": 0.7883, + "step": 22380 + }, + { + "epoch": 0.9203714356150875, + "grad_norm": 1.0787718296051025, + "learning_rate": 0.00011783932205103376, + "loss": 0.7743, + "step": 22400 + }, + { + "epoch": 0.9211931958254581, + "grad_norm": 1.0644888877868652, + "learning_rate": 0.00011770836488790103, + "loss": 0.8028, + "step": 22420 + }, + { + "epoch": 0.9220149560358287, + "grad_norm": 1.1350173950195312, + "learning_rate": 0.00011757737636461711, + "loss": 0.769, + "step": 22440 + }, + { + "epoch": 0.9228367162461993, + "grad_norm": 1.2350654602050781, + "learning_rate": 0.00011744635671315274, + "loss": 0.7614, + "step": 22460 + }, + { + "epoch": 0.92365847645657, + "grad_norm": 1.1380183696746826, + "learning_rate": 0.0001173153061655335, + "loss": 0.7789, + "step": 22480 + }, + { + "epoch": 0.9244802366669406, + "grad_norm": 1.2965232133865356, + "learning_rate": 0.00011718422495383992, + "loss": 0.8028, + "step": 22500 + }, + { + "epoch": 0.9253019968773112, + "grad_norm": 1.2787531614303589, + "learning_rate": 0.00011705311331020667, + "loss": 0.7729, + "step": 22520 + }, + { + "epoch": 0.9261237570876818, + "grad_norm": 1.1900768280029297, + "learning_rate": 0.00011692197146682242, + "loss": 0.7856, + "step": 22540 + }, + { + "epoch": 0.9269455172980524, + "grad_norm": 1.2483367919921875, + "learning_rate": 0.0001167907996559293, + "loss": 0.7906, + "step": 22560 + }, + { + "epoch": 0.927767277508423, + "grad_norm": 1.0251802206039429, + "learning_rate": 0.00011665959810982245, + "loss": 0.7756, + "step": 22580 + }, + { + "epoch": 0.9285890377187936, + "grad_norm": 1.267059087753296, + "learning_rate": 0.00011652836706084969, + "loss": 0.8083, + "step": 22600 + }, + { + "epoch": 0.9294107979291643, + "grad_norm": 1.1620546579360962, + "learning_rate": 0.00011639710674141114, + "loss": 0.7737, + "step": 22620 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 1.09406578540802, + "learning_rate": 0.00011626581738395872, + "loss": 0.8101, + "step": 22640 + }, + { + "epoch": 0.9310543183499055, + "grad_norm": 1.101579189300537, + "learning_rate": 0.00011613449922099576, + "loss": 0.7853, + "step": 22660 + }, + { + "epoch": 0.9318760785602761, + "grad_norm": 1.2870060205459595, + "learning_rate": 0.00011600315248507666, + "loss": 0.7786, + "step": 22680 + }, + { + "epoch": 0.9326978387706467, + "grad_norm": 1.1342302560806274, + "learning_rate": 0.00011587177740880633, + "loss": 0.7688, + "step": 22700 + }, + { + "epoch": 0.9335195989810173, + "grad_norm": 1.2600188255310059, + "learning_rate": 0.00011574037422483995, + "loss": 0.7672, + "step": 22720 + }, + { + "epoch": 0.9343413591913879, + "grad_norm": 1.1466922760009766, + "learning_rate": 0.00011560894316588243, + "loss": 0.7855, + "step": 22740 + }, + { + "epoch": 0.9351631194017586, + "grad_norm": 1.2696952819824219, + "learning_rate": 0.00011547748446468802, + "loss": 0.7869, + "step": 22760 + }, + { + "epoch": 0.9359848796121292, + "grad_norm": 1.2031900882720947, + "learning_rate": 0.00011534599835406001, + "loss": 0.784, + "step": 22780 + }, + { + "epoch": 0.9368066398224998, + "grad_norm": 1.1982570886611938, + "learning_rate": 0.00011521448506685022, + "loss": 0.7838, + "step": 22800 + }, + { + "epoch": 0.9376284000328704, + "grad_norm": 1.0865845680236816, + "learning_rate": 0.00011508294483595845, + "loss": 0.7608, + "step": 22820 + }, + { + "epoch": 0.938450160243241, + "grad_norm": 1.1527810096740723, + "learning_rate": 0.00011495137789433243, + "loss": 0.7637, + "step": 22840 + }, + { + "epoch": 0.9392719204536116, + "grad_norm": 1.3062829971313477, + "learning_rate": 0.00011481978447496704, + "loss": 0.7919, + "step": 22860 + }, + { + "epoch": 0.9400936806639822, + "grad_norm": 1.1722383499145508, + "learning_rate": 0.00011468816481090406, + "loss": 0.7742, + "step": 22880 + }, + { + "epoch": 0.9409154408743529, + "grad_norm": 1.1022741794586182, + "learning_rate": 0.00011455651913523184, + "loss": 0.7798, + "step": 22900 + }, + { + "epoch": 0.9417372010847235, + "grad_norm": 1.2316502332687378, + "learning_rate": 0.0001144248476810847, + "loss": 0.7957, + "step": 22920 + }, + { + "epoch": 0.9425589612950941, + "grad_norm": 1.0445023775100708, + "learning_rate": 0.00011429315068164269, + "loss": 0.7784, + "step": 22940 + }, + { + "epoch": 0.9433807215054647, + "grad_norm": 1.1832512617111206, + "learning_rate": 0.00011416142837013101, + "loss": 0.7939, + "step": 22960 + }, + { + "epoch": 0.9442024817158353, + "grad_norm": 1.1852409839630127, + "learning_rate": 0.00011402968097981976, + "loss": 0.7692, + "step": 22980 + }, + { + "epoch": 0.9450242419262059, + "grad_norm": 1.3063126802444458, + "learning_rate": 0.00011389790874402347, + "loss": 0.7647, + "step": 23000 + }, + { + "epoch": 0.9458460021365765, + "grad_norm": 1.3952887058258057, + "learning_rate": 0.00011376611189610056, + "loss": 0.7942, + "step": 23020 + }, + { + "epoch": 0.9466677623469472, + "grad_norm": 1.1620287895202637, + "learning_rate": 0.0001136342906694531, + "loss": 0.791, + "step": 23040 + }, + { + "epoch": 0.9474895225573178, + "grad_norm": 1.1343626976013184, + "learning_rate": 0.0001135024452975264, + "loss": 0.7983, + "step": 23060 + }, + { + "epoch": 0.9483112827676884, + "grad_norm": 1.3010191917419434, + "learning_rate": 0.00011337057601380841, + "loss": 0.7934, + "step": 23080 + }, + { + "epoch": 0.949133042978059, + "grad_norm": 1.0730324983596802, + "learning_rate": 0.0001132386830518295, + "loss": 0.7907, + "step": 23100 + }, + { + "epoch": 0.9499548031884296, + "grad_norm": 1.1792380809783936, + "learning_rate": 0.00011310676664516196, + "loss": 0.7756, + "step": 23120 + }, + { + "epoch": 0.9507765633988002, + "grad_norm": 1.037816047668457, + "learning_rate": 0.00011297482702741958, + "loss": 0.7706, + "step": 23140 + }, + { + "epoch": 0.9515983236091708, + "grad_norm": 1.2920358180999756, + "learning_rate": 0.00011284286443225725, + "loss": 0.7788, + "step": 23160 + }, + { + "epoch": 0.9524200838195415, + "grad_norm": 1.3024156093597412, + "learning_rate": 0.00011271087909337059, + "loss": 0.7726, + "step": 23180 + }, + { + "epoch": 0.9532418440299121, + "grad_norm": 1.0410034656524658, + "learning_rate": 0.00011257887124449549, + "loss": 0.7358, + "step": 23200 + }, + { + "epoch": 0.9540636042402827, + "grad_norm": 1.1379297971725464, + "learning_rate": 0.00011244684111940765, + "loss": 0.7654, + "step": 23220 + }, + { + "epoch": 0.9548853644506533, + "grad_norm": 1.1472039222717285, + "learning_rate": 0.00011231478895192232, + "loss": 0.7398, + "step": 23240 + }, + { + "epoch": 0.9557071246610239, + "grad_norm": 1.1397725343704224, + "learning_rate": 0.00011218271497589364, + "loss": 0.802, + "step": 23260 + }, + { + "epoch": 0.9565288848713945, + "grad_norm": 1.2153700590133667, + "learning_rate": 0.00011205061942521453, + "loss": 0.8053, + "step": 23280 + }, + { + "epoch": 0.9573506450817652, + "grad_norm": 1.1267844438552856, + "learning_rate": 0.00011191850253381601, + "loss": 0.7778, + "step": 23300 + }, + { + "epoch": 0.9581724052921358, + "grad_norm": 1.2119829654693604, + "learning_rate": 0.00011178636453566691, + "loss": 0.8014, + "step": 23320 + }, + { + "epoch": 0.9589941655025064, + "grad_norm": 1.1640605926513672, + "learning_rate": 0.00011165420566477351, + "loss": 0.7643, + "step": 23340 + }, + { + "epoch": 0.959815925712877, + "grad_norm": 1.2424917221069336, + "learning_rate": 0.0001115220261551789, + "loss": 0.7769, + "step": 23360 + }, + { + "epoch": 0.9606376859232476, + "grad_norm": 1.1949933767318726, + "learning_rate": 0.0001113898262409629, + "loss": 0.7647, + "step": 23380 + }, + { + "epoch": 0.9614594461336182, + "grad_norm": 1.0828710794448853, + "learning_rate": 0.0001112576061562414, + "loss": 0.7729, + "step": 23400 + }, + { + "epoch": 0.9622812063439888, + "grad_norm": 1.1788461208343506, + "learning_rate": 0.00011112536613516589, + "loss": 0.7833, + "step": 23420 + }, + { + "epoch": 0.9631029665543595, + "grad_norm": 1.1178921461105347, + "learning_rate": 0.00011099310641192335, + "loss": 0.7873, + "step": 23440 + }, + { + "epoch": 0.9639247267647301, + "grad_norm": 1.131232738494873, + "learning_rate": 0.00011086082722073556, + "loss": 0.7622, + "step": 23460 + }, + { + "epoch": 0.9647464869751007, + "grad_norm": 1.1513261795043945, + "learning_rate": 0.00011072852879585876, + "loss": 0.7781, + "step": 23480 + }, + { + "epoch": 0.9655682471854713, + "grad_norm": 1.185996413230896, + "learning_rate": 0.00011059621137158332, + "loss": 0.7987, + "step": 23500 + }, + { + "epoch": 0.9663900073958419, + "grad_norm": 1.2174021005630493, + "learning_rate": 0.00011046387518223314, + "loss": 0.7732, + "step": 23520 + }, + { + "epoch": 0.9672117676062125, + "grad_norm": 1.2955466508865356, + "learning_rate": 0.00011033152046216546, + "loss": 0.772, + "step": 23540 + }, + { + "epoch": 0.9680335278165831, + "grad_norm": 1.1724337339401245, + "learning_rate": 0.00011019914744577034, + "loss": 0.7811, + "step": 23560 + }, + { + "epoch": 0.9688552880269538, + "grad_norm": 1.1841801404953003, + "learning_rate": 0.00011006675636747017, + "loss": 0.7893, + "step": 23580 + }, + { + "epoch": 0.9696770482373244, + "grad_norm": 1.1488229036331177, + "learning_rate": 0.00010993434746171933, + "loss": 0.759, + "step": 23600 + }, + { + "epoch": 0.970498808447695, + "grad_norm": 1.2947933673858643, + "learning_rate": 0.00010980192096300389, + "loss": 0.7821, + "step": 23620 + }, + { + "epoch": 0.9713205686580656, + "grad_norm": 1.1167196035385132, + "learning_rate": 0.00010966947710584086, + "loss": 0.7765, + "step": 23640 + }, + { + "epoch": 0.9721423288684362, + "grad_norm": 1.114274024963379, + "learning_rate": 0.00010953701612477821, + "loss": 0.7563, + "step": 23660 + }, + { + "epoch": 0.9729640890788068, + "grad_norm": 1.1539628505706787, + "learning_rate": 0.00010940453825439411, + "loss": 0.7754, + "step": 23680 + }, + { + "epoch": 0.9737858492891774, + "grad_norm": 1.1876133680343628, + "learning_rate": 0.00010927204372929667, + "loss": 0.7681, + "step": 23700 + }, + { + "epoch": 0.974607609499548, + "grad_norm": 1.0904812812805176, + "learning_rate": 0.00010913953278412353, + "loss": 0.7726, + "step": 23720 + }, + { + "epoch": 0.9754293697099187, + "grad_norm": 1.1562939882278442, + "learning_rate": 0.00010900700565354131, + "loss": 0.7774, + "step": 23740 + }, + { + "epoch": 0.9762511299202893, + "grad_norm": 1.227735161781311, + "learning_rate": 0.0001088744625722454, + "loss": 0.7907, + "step": 23760 + }, + { + "epoch": 0.9770728901306599, + "grad_norm": 1.1245887279510498, + "learning_rate": 0.00010874190377495938, + "loss": 0.7882, + "step": 23780 + }, + { + "epoch": 0.9778946503410305, + "grad_norm": 1.1092678308486938, + "learning_rate": 0.0001086093294964347, + "loss": 0.759, + "step": 23800 + }, + { + "epoch": 0.9787164105514011, + "grad_norm": 1.1554052829742432, + "learning_rate": 0.00010847673997145016, + "loss": 0.7665, + "step": 23820 + }, + { + "epoch": 0.9795381707617717, + "grad_norm": 1.1881402730941772, + "learning_rate": 0.00010834413543481163, + "loss": 0.7899, + "step": 23840 + }, + { + "epoch": 0.9803599309721424, + "grad_norm": 1.209314227104187, + "learning_rate": 0.00010821151612135155, + "loss": 0.7478, + "step": 23860 + }, + { + "epoch": 0.981181691182513, + "grad_norm": 1.184097409248352, + "learning_rate": 0.00010807888226592848, + "loss": 0.7626, + "step": 23880 + }, + { + "epoch": 0.9820034513928836, + "grad_norm": 1.109991431236267, + "learning_rate": 0.00010794623410342682, + "loss": 0.7748, + "step": 23900 + }, + { + "epoch": 0.9828252116032542, + "grad_norm": 1.1060677766799927, + "learning_rate": 0.00010781357186875619, + "loss": 0.7873, + "step": 23920 + }, + { + "epoch": 0.9836469718136248, + "grad_norm": 1.1834135055541992, + "learning_rate": 0.00010768089579685126, + "loss": 0.7814, + "step": 23940 + }, + { + "epoch": 0.9844687320239954, + "grad_norm": 1.1708906888961792, + "learning_rate": 0.00010754820612267106, + "loss": 0.753, + "step": 23960 + }, + { + "epoch": 0.985290492234366, + "grad_norm": 1.2200385332107544, + "learning_rate": 0.00010741550308119885, + "loss": 0.7651, + "step": 23980 + }, + { + "epoch": 0.9861122524447367, + "grad_norm": 1.0666234493255615, + "learning_rate": 0.00010728278690744153, + "loss": 0.7795, + "step": 24000 + }, + { + "epoch": 0.9861122524447367, + "eval_loss": 1.02431058883667, + "eval_runtime": 16.5783, + "eval_samples_per_second": 158.038, + "eval_steps_per_second": 4.946, + "step": 24000 + }, + { + "epoch": 0.9869340126551073, + "grad_norm": 1.1617413759231567, + "learning_rate": 0.00010715005783642917, + "loss": 0.7878, + "step": 24020 + }, + { + "epoch": 0.9877557728654779, + "grad_norm": 1.11324942111969, + "learning_rate": 0.00010701731610321475, + "loss": 0.7969, + "step": 24040 + }, + { + "epoch": 0.9885775330758485, + "grad_norm": 1.1129649877548218, + "learning_rate": 0.00010688456194287368, + "loss": 0.7595, + "step": 24060 + }, + { + "epoch": 0.9893992932862191, + "grad_norm": 1.186141848564148, + "learning_rate": 0.00010675179559050332, + "loss": 0.8078, + "step": 24080 + }, + { + "epoch": 0.9902210534965897, + "grad_norm": 1.205206036567688, + "learning_rate": 0.00010661901728122272, + "loss": 0.7449, + "step": 24100 + }, + { + "epoch": 0.9910428137069603, + "grad_norm": 1.2314426898956299, + "learning_rate": 0.00010648622725017199, + "loss": 0.768, + "step": 24120 + }, + { + "epoch": 0.991864573917331, + "grad_norm": 1.1358891725540161, + "learning_rate": 0.00010635342573251209, + "loss": 0.7753, + "step": 24140 + }, + { + "epoch": 0.9926863341277016, + "grad_norm": 1.3078477382659912, + "learning_rate": 0.00010622061296342425, + "loss": 0.7845, + "step": 24160 + }, + { + "epoch": 0.9935080943380722, + "grad_norm": 1.105637550354004, + "learning_rate": 0.0001060877891781097, + "loss": 0.7741, + "step": 24180 + }, + { + "epoch": 0.9943298545484428, + "grad_norm": 1.1030614376068115, + "learning_rate": 0.00010595495461178912, + "loss": 0.7611, + "step": 24200 + }, + { + "epoch": 0.9951516147588134, + "grad_norm": 1.0990793704986572, + "learning_rate": 0.00010582210949970233, + "loss": 0.7711, + "step": 24220 + }, + { + "epoch": 0.995973374969184, + "grad_norm": 1.1833900213241577, + "learning_rate": 0.00010568925407710773, + "loss": 0.7684, + "step": 24240 + }, + { + "epoch": 0.9967951351795546, + "grad_norm": 1.0902843475341797, + "learning_rate": 0.00010555638857928209, + "loss": 0.7709, + "step": 24260 + }, + { + "epoch": 0.9976168953899253, + "grad_norm": 1.1372528076171875, + "learning_rate": 0.00010542351324152, + "loss": 0.7764, + "step": 24280 + }, + { + "epoch": 0.9984386556002959, + "grad_norm": 1.0270088911056519, + "learning_rate": 0.00010529062829913343, + "loss": 0.7638, + "step": 24300 + }, + { + "epoch": 0.9992604158106665, + "grad_norm": 1.0260668992996216, + "learning_rate": 0.0001051577339874514, + "loss": 0.781, + "step": 24320 + }, + { + "epoch": 1.000082176021037, + "grad_norm": 1.0618846416473389, + "learning_rate": 0.00010502483054181948, + "loss": 0.7616, + "step": 24340 + }, + { + "epoch": 1.0009039362314076, + "grad_norm": 1.0496258735656738, + "learning_rate": 0.00010489191819759945, + "loss": 0.6874, + "step": 24360 + }, + { + "epoch": 1.0017256964417782, + "grad_norm": 1.304571270942688, + "learning_rate": 0.00010475899719016886, + "loss": 0.6708, + "step": 24380 + }, + { + "epoch": 1.0025474566521488, + "grad_norm": 1.4253283739089966, + "learning_rate": 0.00010462606775492054, + "loss": 0.6789, + "step": 24400 + }, + { + "epoch": 1.0033692168625195, + "grad_norm": 1.1459177732467651, + "learning_rate": 0.00010449313012726234, + "loss": 0.6678, + "step": 24420 + }, + { + "epoch": 1.00419097707289, + "grad_norm": 1.2071423530578613, + "learning_rate": 0.00010436018454261654, + "loss": 0.6639, + "step": 24440 + }, + { + "epoch": 1.0050127372832607, + "grad_norm": 1.1461540460586548, + "learning_rate": 0.00010422723123641956, + "loss": 0.6722, + "step": 24460 + }, + { + "epoch": 1.0058344974936313, + "grad_norm": 1.2865045070648193, + "learning_rate": 0.00010409427044412141, + "loss": 0.6756, + "step": 24480 + }, + { + "epoch": 1.006656257704002, + "grad_norm": 1.0493274927139282, + "learning_rate": 0.00010396130240118549, + "loss": 0.677, + "step": 24500 + }, + { + "epoch": 1.0074780179143725, + "grad_norm": 1.171351671218872, + "learning_rate": 0.00010382832734308792, + "loss": 0.6736, + "step": 24520 + }, + { + "epoch": 1.0082997781247431, + "grad_norm": 1.1830470561981201, + "learning_rate": 0.00010369534550531734, + "loss": 0.6575, + "step": 24540 + }, + { + "epoch": 1.0091215383351138, + "grad_norm": 1.1490957736968994, + "learning_rate": 0.00010357565624934433, + "loss": 0.6628, + "step": 24560 + }, + { + "epoch": 1.0099432985454844, + "grad_norm": 1.0752952098846436, + "learning_rate": 0.00010344266217900869, + "loss": 0.6489, + "step": 24580 + }, + { + "epoch": 1.010765058755855, + "grad_norm": 1.1401771306991577, + "learning_rate": 0.00010330966201198336, + "loss": 0.6436, + "step": 24600 + }, + { + "epoch": 1.0115868189662256, + "grad_norm": 1.0942751169204712, + "learning_rate": 0.00010317665598380131, + "loss": 0.6908, + "step": 24620 + }, + { + "epoch": 1.0124085791765962, + "grad_norm": 1.23777437210083, + "learning_rate": 0.00010304364433000604, + "loss": 0.676, + "step": 24640 + }, + { + "epoch": 1.0132303393869668, + "grad_norm": 1.3880153894424438, + "learning_rate": 0.00010291062728615099, + "loss": 0.6838, + "step": 24660 + }, + { + "epoch": 1.0140520995973374, + "grad_norm": 1.090610384941101, + "learning_rate": 0.00010277760508779903, + "loss": 0.6505, + "step": 24680 + }, + { + "epoch": 1.014873859807708, + "grad_norm": 1.1579679250717163, + "learning_rate": 0.00010264457797052227, + "loss": 0.679, + "step": 24700 + }, + { + "epoch": 1.0156956200180787, + "grad_norm": 1.1469173431396484, + "learning_rate": 0.00010251154616990151, + "loss": 0.7002, + "step": 24720 + }, + { + "epoch": 1.0165173802284493, + "grad_norm": 1.1271486282348633, + "learning_rate": 0.00010237850992152578, + "loss": 0.6774, + "step": 24740 + }, + { + "epoch": 1.01733914043882, + "grad_norm": 1.2221741676330566, + "learning_rate": 0.00010224546946099209, + "loss": 0.6912, + "step": 24760 + }, + { + "epoch": 1.0181609006491905, + "grad_norm": 1.0773913860321045, + "learning_rate": 0.00010211242502390481, + "loss": 0.6631, + "step": 24780 + }, + { + "epoch": 1.0189826608595611, + "grad_norm": 1.3026174306869507, + "learning_rate": 0.0001019793768458754, + "loss": 0.6731, + "step": 24800 + }, + { + "epoch": 1.0198044210699317, + "grad_norm": 1.1742732524871826, + "learning_rate": 0.00010184632516252199, + "loss": 0.6562, + "step": 24820 + }, + { + "epoch": 1.0206261812803024, + "grad_norm": 1.1922106742858887, + "learning_rate": 0.00010171992303113935, + "loss": 0.6519, + "step": 24840 + }, + { + "epoch": 1.021447941490673, + "grad_norm": 1.0810050964355469, + "learning_rate": 0.0001015868651901237, + "loss": 0.6625, + "step": 24860 + }, + { + "epoch": 1.0222697017010436, + "grad_norm": 1.4511394500732422, + "learning_rate": 0.00010145380453889195, + "loss": 0.6955, + "step": 24880 + }, + { + "epoch": 1.0230914619114142, + "grad_norm": 1.1028388738632202, + "learning_rate": 0.0001013207413130843, + "loss": 0.6576, + "step": 24900 + }, + { + "epoch": 1.0239132221217848, + "grad_norm": 1.169084072113037, + "learning_rate": 0.0001011876757483454, + "loss": 0.6678, + "step": 24920 + }, + { + "epoch": 1.0247349823321554, + "grad_norm": 1.1536842584609985, + "learning_rate": 0.00010105460808032418, + "loss": 0.6477, + "step": 24940 + }, + { + "epoch": 1.025556742542526, + "grad_norm": 1.123879313468933, + "learning_rate": 0.00010092153854467326, + "loss": 0.679, + "step": 24960 + }, + { + "epoch": 1.0263785027528967, + "grad_norm": 1.019872784614563, + "learning_rate": 0.00010078846737704848, + "loss": 0.6854, + "step": 24980 + }, + { + "epoch": 1.0272002629632673, + "grad_norm": 1.3068866729736328, + "learning_rate": 0.00010065539481310871, + "loss": 0.6717, + "step": 25000 + }, + { + "epoch": 1.0280220231736379, + "grad_norm": 1.2124892473220825, + "learning_rate": 0.00010052232108851513, + "loss": 0.649, + "step": 25020 + }, + { + "epoch": 1.0288437833840085, + "grad_norm": 1.2036775350570679, + "learning_rate": 0.00010038924643893113, + "loss": 0.666, + "step": 25040 + }, + { + "epoch": 1.0296655435943791, + "grad_norm": 1.244070053100586, + "learning_rate": 0.00010025617110002165, + "loss": 0.6654, + "step": 25060 + }, + { + "epoch": 1.0304873038047497, + "grad_norm": 1.1506962776184082, + "learning_rate": 0.00010012309530745285, + "loss": 0.6764, + "step": 25080 + }, + { + "epoch": 1.0313090640151203, + "grad_norm": 1.1000773906707764, + "learning_rate": 9.999001929689177e-05, + "loss": 0.6574, + "step": 25100 + }, + { + "epoch": 1.032130824225491, + "grad_norm": 1.0797061920166016, + "learning_rate": 9.985694330400571e-05, + "loss": 0.6618, + "step": 25120 + }, + { + "epoch": 1.0329525844358616, + "grad_norm": 1.1669949293136597, + "learning_rate": 9.972386756446208e-05, + "loss": 0.6731, + "step": 25140 + }, + { + "epoch": 1.0337743446462322, + "grad_norm": 1.1310721635818481, + "learning_rate": 9.959079231392771e-05, + "loss": 0.6773, + "step": 25160 + }, + { + "epoch": 1.0345961048566028, + "grad_norm": 1.1668230295181274, + "learning_rate": 9.945771778806865e-05, + "loss": 0.6772, + "step": 25180 + }, + { + "epoch": 1.0354178650669734, + "grad_norm": 1.1975557804107666, + "learning_rate": 9.93246442225497e-05, + "loss": 0.6801, + "step": 25200 + }, + { + "epoch": 1.036239625277344, + "grad_norm": 1.1820405721664429, + "learning_rate": 9.919157185303379e-05, + "loss": 0.6581, + "step": 25220 + }, + { + "epoch": 1.0370613854877146, + "grad_norm": 1.1540806293487549, + "learning_rate": 9.90585009151819e-05, + "loss": 0.6732, + "step": 25240 + }, + { + "epoch": 1.0378831456980853, + "grad_norm": 1.064178228378296, + "learning_rate": 9.892543164465243e-05, + "loss": 0.6732, + "step": 25260 + }, + { + "epoch": 1.0387049059084559, + "grad_norm": 1.107135534286499, + "learning_rate": 9.879236427710082e-05, + "loss": 0.6708, + "step": 25280 + }, + { + "epoch": 1.0395266661188265, + "grad_norm": 1.172105312347412, + "learning_rate": 9.865929904817909e-05, + "loss": 0.6818, + "step": 25300 + }, + { + "epoch": 1.040348426329197, + "grad_norm": 1.1912182569503784, + "learning_rate": 9.85262361935356e-05, + "loss": 0.7184, + "step": 25320 + }, + { + "epoch": 1.0411701865395677, + "grad_norm": 1.1023154258728027, + "learning_rate": 9.83931759488143e-05, + "loss": 0.6638, + "step": 25340 + }, + { + "epoch": 1.0419919467499383, + "grad_norm": 1.076657772064209, + "learning_rate": 9.826011854965474e-05, + "loss": 0.6727, + "step": 25360 + }, + { + "epoch": 1.042813706960309, + "grad_norm": 1.2090641260147095, + "learning_rate": 9.812706423169129e-05, + "loss": 0.6725, + "step": 25380 + }, + { + "epoch": 1.0436354671706796, + "grad_norm": 1.1991325616836548, + "learning_rate": 9.799401323055292e-05, + "loss": 0.6793, + "step": 25400 + }, + { + "epoch": 1.0444572273810502, + "grad_norm": 1.0969122648239136, + "learning_rate": 9.786096578186273e-05, + "loss": 0.6803, + "step": 25420 + }, + { + "epoch": 1.0452789875914208, + "grad_norm": 1.2352911233901978, + "learning_rate": 9.772792212123748e-05, + "loss": 0.6789, + "step": 25440 + }, + { + "epoch": 1.0461007478017914, + "grad_norm": 1.2759590148925781, + "learning_rate": 9.759488248428725e-05, + "loss": 0.6728, + "step": 25460 + }, + { + "epoch": 1.046922508012162, + "grad_norm": 1.3433208465576172, + "learning_rate": 9.7461847106615e-05, + "loss": 0.6835, + "step": 25480 + }, + { + "epoch": 1.0477442682225326, + "grad_norm": 1.2637454271316528, + "learning_rate": 9.732881622381616e-05, + "loss": 0.6738, + "step": 25500 + }, + { + "epoch": 1.0485660284329033, + "grad_norm": 1.1877262592315674, + "learning_rate": 9.719579007147815e-05, + "loss": 0.6633, + "step": 25520 + }, + { + "epoch": 1.0493877886432739, + "grad_norm": 1.1830005645751953, + "learning_rate": 9.706276888518013e-05, + "loss": 0.6559, + "step": 25540 + }, + { + "epoch": 1.0502095488536445, + "grad_norm": 1.1390386819839478, + "learning_rate": 9.692975290049228e-05, + "loss": 0.6782, + "step": 25560 + }, + { + "epoch": 1.051031309064015, + "grad_norm": 1.1353988647460938, + "learning_rate": 9.679674235297572e-05, + "loss": 0.6781, + "step": 25580 + }, + { + "epoch": 1.0518530692743857, + "grad_norm": 1.1977840662002563, + "learning_rate": 9.666373747818187e-05, + "loss": 0.6697, + "step": 25600 + }, + { + "epoch": 1.0526748294847563, + "grad_norm": 1.0405921936035156, + "learning_rate": 9.653073851165214e-05, + "loss": 0.6715, + "step": 25620 + }, + { + "epoch": 1.053496589695127, + "grad_norm": 1.1129424571990967, + "learning_rate": 9.63977456889175e-05, + "loss": 0.6806, + "step": 25640 + }, + { + "epoch": 1.0543183499054976, + "grad_norm": 1.163406491279602, + "learning_rate": 9.626475924549792e-05, + "loss": 0.6457, + "step": 25660 + }, + { + "epoch": 1.0551401101158682, + "grad_norm": 1.1244803667068481, + "learning_rate": 9.613177941690219e-05, + "loss": 0.6664, + "step": 25680 + }, + { + "epoch": 1.0559618703262388, + "grad_norm": 1.1291669607162476, + "learning_rate": 9.599880643862737e-05, + "loss": 0.6642, + "step": 25700 + }, + { + "epoch": 1.0567836305366094, + "grad_norm": 1.3684684038162231, + "learning_rate": 9.586584054615836e-05, + "loss": 0.6833, + "step": 25720 + }, + { + "epoch": 1.05760539074698, + "grad_norm": 1.0214548110961914, + "learning_rate": 9.57328819749675e-05, + "loss": 0.6693, + "step": 25740 + }, + { + "epoch": 1.0584271509573506, + "grad_norm": 1.0439046621322632, + "learning_rate": 9.559993096051425e-05, + "loss": 0.6563, + "step": 25760 + }, + { + "epoch": 1.0592489111677212, + "grad_norm": 1.1112405061721802, + "learning_rate": 9.546698773824453e-05, + "loss": 0.6683, + "step": 25780 + }, + { + "epoch": 1.0600706713780919, + "grad_norm": 1.2522891759872437, + "learning_rate": 9.53340525435906e-05, + "loss": 0.656, + "step": 25800 + }, + { + "epoch": 1.0608924315884625, + "grad_norm": 1.133664846420288, + "learning_rate": 9.520112561197045e-05, + "loss": 0.6891, + "step": 25820 + }, + { + "epoch": 1.061714191798833, + "grad_norm": 1.3006486892700195, + "learning_rate": 9.506820717878745e-05, + "loss": 0.6536, + "step": 25840 + }, + { + "epoch": 1.0625359520092037, + "grad_norm": 1.2655730247497559, + "learning_rate": 9.493529747942996e-05, + "loss": 0.687, + "step": 25860 + }, + { + "epoch": 1.0633577122195743, + "grad_norm": 1.1235599517822266, + "learning_rate": 9.480239674927074e-05, + "loss": 0.665, + "step": 25880 + }, + { + "epoch": 1.064179472429945, + "grad_norm": 1.250252604484558, + "learning_rate": 9.46695052236668e-05, + "loss": 0.6604, + "step": 25900 + }, + { + "epoch": 1.0650012326403155, + "grad_norm": 1.0628433227539062, + "learning_rate": 9.453662313795879e-05, + "loss": 0.6565, + "step": 25920 + }, + { + "epoch": 1.0658229928506862, + "grad_norm": 1.1810961961746216, + "learning_rate": 9.440375072747064e-05, + "loss": 0.6722, + "step": 25940 + }, + { + "epoch": 1.0666447530610568, + "grad_norm": 1.2217936515808105, + "learning_rate": 9.427753111349993e-05, + "loss": 0.6913, + "step": 25960 + }, + { + "epoch": 1.0674665132714274, + "grad_norm": 1.0944212675094604, + "learning_rate": 9.414467824647579e-05, + "loss": 0.6668, + "step": 25980 + }, + { + "epoch": 1.068288273481798, + "grad_norm": 1.083652377128601, + "learning_rate": 9.401183574877579e-05, + "loss": 0.6713, + "step": 26000 + }, + { + "epoch": 1.068288273481798, + "eval_loss": 1.0167440176010132, + "eval_runtime": 16.6957, + "eval_samples_per_second": 156.927, + "eval_steps_per_second": 4.911, + "step": 26000 + }, + { + "epoch": 1.0691100336921686, + "grad_norm": 1.117463231086731, + "learning_rate": 9.387900385565371e-05, + "loss": 0.6435, + "step": 26020 + }, + { + "epoch": 1.0699317939025392, + "grad_norm": 1.027099847793579, + "learning_rate": 9.374618280234465e-05, + "loss": 0.6909, + "step": 26040 + }, + { + "epoch": 1.0707535541129098, + "grad_norm": 1.3012546300888062, + "learning_rate": 9.36133728240645e-05, + "loss": 0.6741, + "step": 26060 + }, + { + "epoch": 1.0715753143232805, + "grad_norm": 1.179627776145935, + "learning_rate": 9.348057415600942e-05, + "loss": 0.687, + "step": 26080 + }, + { + "epoch": 1.072397074533651, + "grad_norm": 1.1117219924926758, + "learning_rate": 9.334778703335568e-05, + "loss": 0.6616, + "step": 26100 + }, + { + "epoch": 1.0732188347440217, + "grad_norm": 1.1664800643920898, + "learning_rate": 9.321501169125905e-05, + "loss": 0.6648, + "step": 26120 + }, + { + "epoch": 1.0740405949543923, + "grad_norm": 1.1486011743545532, + "learning_rate": 9.308224836485447e-05, + "loss": 0.684, + "step": 26140 + }, + { + "epoch": 1.074862355164763, + "grad_norm": 1.1502150297164917, + "learning_rate": 9.294949728925554e-05, + "loss": 0.6682, + "step": 26160 + }, + { + "epoch": 1.0756841153751335, + "grad_norm": 1.0855724811553955, + "learning_rate": 9.281675869955419e-05, + "loss": 0.671, + "step": 26180 + }, + { + "epoch": 1.0765058755855041, + "grad_norm": 1.267439365386963, + "learning_rate": 9.268403283082024e-05, + "loss": 0.6935, + "step": 26200 + }, + { + "epoch": 1.0773276357958748, + "grad_norm": 1.19661545753479, + "learning_rate": 9.255131991810099e-05, + "loss": 0.6611, + "step": 26220 + }, + { + "epoch": 1.0781493960062454, + "grad_norm": 1.1992172002792358, + "learning_rate": 9.241862019642083e-05, + "loss": 0.6527, + "step": 26240 + }, + { + "epoch": 1.078971156216616, + "grad_norm": 1.117606520652771, + "learning_rate": 9.228593390078073e-05, + "loss": 0.6792, + "step": 26260 + }, + { + "epoch": 1.0797929164269866, + "grad_norm": 1.1372742652893066, + "learning_rate": 9.21532612661579e-05, + "loss": 0.6742, + "step": 26280 + }, + { + "epoch": 1.0806146766373572, + "grad_norm": 1.1025826930999756, + "learning_rate": 9.202060252750539e-05, + "loss": 0.6559, + "step": 26300 + }, + { + "epoch": 1.0814364368477278, + "grad_norm": 1.222730278968811, + "learning_rate": 9.188795791975155e-05, + "loss": 0.6711, + "step": 26320 + }, + { + "epoch": 1.0822581970580984, + "grad_norm": 1.1559460163116455, + "learning_rate": 9.17553276777998e-05, + "loss": 0.6432, + "step": 26340 + }, + { + "epoch": 1.083079957268469, + "grad_norm": 1.0200605392456055, + "learning_rate": 9.162271203652811e-05, + "loss": 0.6606, + "step": 26360 + }, + { + "epoch": 1.0839017174788397, + "grad_norm": 1.1216548681259155, + "learning_rate": 9.149011123078861e-05, + "loss": 0.6608, + "step": 26380 + }, + { + "epoch": 1.0847234776892103, + "grad_norm": 1.0914236307144165, + "learning_rate": 9.135752549540704e-05, + "loss": 0.6644, + "step": 26400 + }, + { + "epoch": 1.085545237899581, + "grad_norm": 1.2880839109420776, + "learning_rate": 9.122495506518254e-05, + "loss": 0.6699, + "step": 26420 + }, + { + "epoch": 1.0863669981099515, + "grad_norm": 1.3740962743759155, + "learning_rate": 9.10924001748872e-05, + "loss": 0.661, + "step": 26440 + }, + { + "epoch": 1.0871887583203221, + "grad_norm": 1.2001346349716187, + "learning_rate": 9.095986105926547e-05, + "loss": 0.6806, + "step": 26460 + }, + { + "epoch": 1.0880105185306927, + "grad_norm": 1.1882902383804321, + "learning_rate": 9.082733795303393e-05, + "loss": 0.6581, + "step": 26480 + }, + { + "epoch": 1.0888322787410634, + "grad_norm": 1.1991751194000244, + "learning_rate": 9.069483109088086e-05, + "loss": 0.6684, + "step": 26500 + }, + { + "epoch": 1.089654038951434, + "grad_norm": 1.2505358457565308, + "learning_rate": 9.056234070746562e-05, + "loss": 0.6719, + "step": 26520 + }, + { + "epoch": 1.0904757991618046, + "grad_norm": 1.1861186027526855, + "learning_rate": 9.042986703741853e-05, + "loss": 0.6723, + "step": 26540 + }, + { + "epoch": 1.0912975593721752, + "grad_norm": 1.2916886806488037, + "learning_rate": 9.029741031534025e-05, + "loss": 0.6649, + "step": 26560 + }, + { + "epoch": 1.0921193195825458, + "grad_norm": 1.1472935676574707, + "learning_rate": 9.016497077580147e-05, + "loss": 0.6717, + "step": 26580 + }, + { + "epoch": 1.0929410797929164, + "grad_norm": 1.1004657745361328, + "learning_rate": 9.00325486533424e-05, + "loss": 0.659, + "step": 26600 + }, + { + "epoch": 1.093762840003287, + "grad_norm": 1.374177098274231, + "learning_rate": 8.990014418247242e-05, + "loss": 0.663, + "step": 26620 + }, + { + "epoch": 1.0945846002136577, + "grad_norm": 1.1015247106552124, + "learning_rate": 8.976775759766963e-05, + "loss": 0.6565, + "step": 26640 + }, + { + "epoch": 1.0954063604240283, + "grad_norm": 1.4717910289764404, + "learning_rate": 8.963538913338051e-05, + "loss": 0.6628, + "step": 26660 + }, + { + "epoch": 1.096228120634399, + "grad_norm": 1.180732011795044, + "learning_rate": 8.950303902401942e-05, + "loss": 0.6674, + "step": 26680 + }, + { + "epoch": 1.0970498808447695, + "grad_norm": 1.168150782585144, + "learning_rate": 8.937070750396826e-05, + "loss": 0.686, + "step": 26700 + }, + { + "epoch": 1.0978716410551401, + "grad_norm": 1.3096245527267456, + "learning_rate": 8.923839480757589e-05, + "loss": 0.6577, + "step": 26720 + }, + { + "epoch": 1.0986934012655107, + "grad_norm": 1.1966016292572021, + "learning_rate": 8.910610116915797e-05, + "loss": 0.6614, + "step": 26740 + }, + { + "epoch": 1.0995151614758814, + "grad_norm": 1.2684301137924194, + "learning_rate": 8.897382682299628e-05, + "loss": 0.6696, + "step": 26760 + }, + { + "epoch": 1.100336921686252, + "grad_norm": 1.1399879455566406, + "learning_rate": 8.884157200333856e-05, + "loss": 0.6443, + "step": 26780 + }, + { + "epoch": 1.1011586818966226, + "grad_norm": 1.2215358018875122, + "learning_rate": 8.870933694439789e-05, + "loss": 0.6749, + "step": 26800 + }, + { + "epoch": 1.1019804421069932, + "grad_norm": 1.1253427267074585, + "learning_rate": 8.857712188035245e-05, + "loss": 0.6739, + "step": 26820 + }, + { + "epoch": 1.1028022023173638, + "grad_norm": 1.0373494625091553, + "learning_rate": 8.844492704534485e-05, + "loss": 0.6877, + "step": 26840 + }, + { + "epoch": 1.1036239625277344, + "grad_norm": 1.2346014976501465, + "learning_rate": 8.831275267348199e-05, + "loss": 0.6754, + "step": 26860 + }, + { + "epoch": 1.104445722738105, + "grad_norm": 0.9985619187355042, + "learning_rate": 8.818059899883456e-05, + "loss": 0.6806, + "step": 26880 + }, + { + "epoch": 1.1052674829484757, + "grad_norm": 1.1723840236663818, + "learning_rate": 8.804846625543646e-05, + "loss": 0.6757, + "step": 26900 + }, + { + "epoch": 1.1060892431588463, + "grad_norm": 1.1545828580856323, + "learning_rate": 8.791635467728471e-05, + "loss": 0.6533, + "step": 26920 + }, + { + "epoch": 1.1069110033692169, + "grad_norm": 1.0837960243225098, + "learning_rate": 8.778426449833865e-05, + "loss": 0.6614, + "step": 26940 + }, + { + "epoch": 1.1077327635795875, + "grad_norm": 1.1886368989944458, + "learning_rate": 8.765219595251986e-05, + "loss": 0.6571, + "step": 26960 + }, + { + "epoch": 1.1085545237899581, + "grad_norm": 1.2560697793960571, + "learning_rate": 8.753335295091204e-05, + "loss": 0.6819, + "step": 26980 + }, + { + "epoch": 1.1093762840003287, + "grad_norm": 1.0834345817565918, + "learning_rate": 8.74013261523514e-05, + "loss": 0.6838, + "step": 27000 + }, + { + "epoch": 1.1101980442106993, + "grad_norm": 1.2474608421325684, + "learning_rate": 8.72693216650723e-05, + "loss": 0.6479, + "step": 27020 + }, + { + "epoch": 1.11101980442107, + "grad_norm": 1.2082535028457642, + "learning_rate": 8.713733972284461e-05, + "loss": 0.6479, + "step": 27040 + }, + { + "epoch": 1.1118415646314406, + "grad_norm": 1.2836271524429321, + "learning_rate": 8.700538055939816e-05, + "loss": 0.6927, + "step": 27060 + }, + { + "epoch": 1.1126633248418112, + "grad_norm": 1.235249400138855, + "learning_rate": 8.687344440842249e-05, + "loss": 0.6672, + "step": 27080 + }, + { + "epoch": 1.1134850850521818, + "grad_norm": 1.1937826871871948, + "learning_rate": 8.67415315035664e-05, + "loss": 0.6775, + "step": 27100 + }, + { + "epoch": 1.1143068452625524, + "grad_norm": 1.210194706916809, + "learning_rate": 8.66096420784374e-05, + "loss": 0.6656, + "step": 27120 + }, + { + "epoch": 1.115128605472923, + "grad_norm": 1.1403838396072388, + "learning_rate": 8.647777636660159e-05, + "loss": 0.6406, + "step": 27140 + }, + { + "epoch": 1.1159503656832936, + "grad_norm": 1.0807080268859863, + "learning_rate": 8.634593460158293e-05, + "loss": 0.6712, + "step": 27160 + }, + { + "epoch": 1.1167721258936643, + "grad_norm": 1.2173619270324707, + "learning_rate": 8.621411701686309e-05, + "loss": 0.6671, + "step": 27180 + }, + { + "epoch": 1.1175938861040349, + "grad_norm": 1.2164521217346191, + "learning_rate": 8.608232384588086e-05, + "loss": 0.6551, + "step": 27200 + }, + { + "epoch": 1.1184156463144055, + "grad_norm": 1.3138072490692139, + "learning_rate": 8.595055532203177e-05, + "loss": 0.681, + "step": 27220 + }, + { + "epoch": 1.119237406524776, + "grad_norm": 1.1198703050613403, + "learning_rate": 8.581881167866774e-05, + "loss": 0.6533, + "step": 27240 + }, + { + "epoch": 1.1200591667351467, + "grad_norm": 1.1022275686264038, + "learning_rate": 8.568709314909663e-05, + "loss": 0.6441, + "step": 27260 + }, + { + "epoch": 1.1208809269455173, + "grad_norm": 1.2056939601898193, + "learning_rate": 8.555539996658184e-05, + "loss": 0.6768, + "step": 27280 + }, + { + "epoch": 1.121702687155888, + "grad_norm": 1.2110202312469482, + "learning_rate": 8.542373236434182e-05, + "loss": 0.6646, + "step": 27300 + }, + { + "epoch": 1.1225244473662586, + "grad_norm": 1.0746920108795166, + "learning_rate": 8.529209057554984e-05, + "loss": 0.679, + "step": 27320 + }, + { + "epoch": 1.1233462075766292, + "grad_norm": 1.1810733079910278, + "learning_rate": 8.516047483333325e-05, + "loss": 0.6643, + "step": 27340 + }, + { + "epoch": 1.1241679677869998, + "grad_norm": 1.1760369539260864, + "learning_rate": 8.502888537077345e-05, + "loss": 0.6543, + "step": 27360 + }, + { + "epoch": 1.1249897279973704, + "grad_norm": 1.2052829265594482, + "learning_rate": 8.489732242090527e-05, + "loss": 0.6448, + "step": 27380 + }, + { + "epoch": 1.125811488207741, + "grad_norm": 1.2746154069900513, + "learning_rate": 8.476578621671659e-05, + "loss": 0.6622, + "step": 27400 + }, + { + "epoch": 1.1266332484181116, + "grad_norm": 1.186718225479126, + "learning_rate": 8.463427699114785e-05, + "loss": 0.6626, + "step": 27420 + }, + { + "epoch": 1.1274550086284822, + "grad_norm": 1.299566388130188, + "learning_rate": 8.450279497709179e-05, + "loss": 0.677, + "step": 27440 + }, + { + "epoch": 1.1282767688388529, + "grad_norm": 1.2594420909881592, + "learning_rate": 8.43713404073929e-05, + "loss": 0.6743, + "step": 27460 + }, + { + "epoch": 1.1290985290492235, + "grad_norm": 1.1725876331329346, + "learning_rate": 8.423991351484716e-05, + "loss": 0.6861, + "step": 27480 + }, + { + "epoch": 1.129920289259594, + "grad_norm": 1.1445516347885132, + "learning_rate": 8.410851453220143e-05, + "loss": 0.6626, + "step": 27500 + }, + { + "epoch": 1.1307420494699647, + "grad_norm": 1.0811272859573364, + "learning_rate": 8.397714369215324e-05, + "loss": 0.6527, + "step": 27520 + }, + { + "epoch": 1.1315638096803353, + "grad_norm": 1.0390878915786743, + "learning_rate": 8.384580122735024e-05, + "loss": 0.6966, + "step": 27540 + }, + { + "epoch": 1.132385569890706, + "grad_norm": 1.1482422351837158, + "learning_rate": 8.371448737038976e-05, + "loss": 0.6674, + "step": 27560 + }, + { + "epoch": 1.1332073301010765, + "grad_norm": 1.1507283449172974, + "learning_rate": 8.35832023538186e-05, + "loss": 0.6776, + "step": 27580 + }, + { + "epoch": 1.1340290903114472, + "grad_norm": 1.141131043434143, + "learning_rate": 8.345194641013236e-05, + "loss": 0.6824, + "step": 27600 + }, + { + "epoch": 1.1348508505218178, + "grad_norm": 1.1057684421539307, + "learning_rate": 8.332071977177526e-05, + "loss": 0.687, + "step": 27620 + }, + { + "epoch": 1.1356726107321884, + "grad_norm": 1.4246220588684082, + "learning_rate": 8.318952267113958e-05, + "loss": 0.6624, + "step": 27640 + }, + { + "epoch": 1.136494370942559, + "grad_norm": 0.9534235000610352, + "learning_rate": 8.305835534056524e-05, + "loss": 0.668, + "step": 27660 + }, + { + "epoch": 1.1373161311529296, + "grad_norm": 1.2411669492721558, + "learning_rate": 8.29272180123395e-05, + "loss": 0.6585, + "step": 27680 + }, + { + "epoch": 1.1381378913633002, + "grad_norm": 1.137723684310913, + "learning_rate": 8.279611091869651e-05, + "loss": 0.6684, + "step": 27700 + }, + { + "epoch": 1.1389596515736709, + "grad_norm": 1.1483657360076904, + "learning_rate": 8.26650342918168e-05, + "loss": 0.668, + "step": 27720 + }, + { + "epoch": 1.1397814117840415, + "grad_norm": 1.1523141860961914, + "learning_rate": 8.253398836382702e-05, + "loss": 0.6957, + "step": 27740 + }, + { + "epoch": 1.140603171994412, + "grad_norm": 1.1631639003753662, + "learning_rate": 8.240297336679942e-05, + "loss": 0.66, + "step": 27760 + }, + { + "epoch": 1.1414249322047827, + "grad_norm": 1.2860993146896362, + "learning_rate": 8.227198953275142e-05, + "loss": 0.685, + "step": 27780 + }, + { + "epoch": 1.1422466924151533, + "grad_norm": 1.236307144165039, + "learning_rate": 8.214103709364535e-05, + "loss": 0.663, + "step": 27800 + }, + { + "epoch": 1.143068452625524, + "grad_norm": 1.1525200605392456, + "learning_rate": 8.201011628138789e-05, + "loss": 0.6581, + "step": 27820 + }, + { + "epoch": 1.1438902128358945, + "grad_norm": 1.1676980257034302, + "learning_rate": 8.187922732782976e-05, + "loss": 0.6768, + "step": 27840 + }, + { + "epoch": 1.1447119730462652, + "grad_norm": 1.0874487161636353, + "learning_rate": 8.174837046476518e-05, + "loss": 0.6336, + "step": 27860 + }, + { + "epoch": 1.1455337332566358, + "grad_norm": 0.9278498291969299, + "learning_rate": 8.16175459239316e-05, + "loss": 0.6767, + "step": 27880 + }, + { + "epoch": 1.1463554934670064, + "grad_norm": 1.039865255355835, + "learning_rate": 8.148675393700918e-05, + "loss": 0.6686, + "step": 27900 + }, + { + "epoch": 1.147177253677377, + "grad_norm": 1.0158859491348267, + "learning_rate": 8.135599473562048e-05, + "loss": 0.685, + "step": 27920 + }, + { + "epoch": 1.1479990138877476, + "grad_norm": 1.2587593793869019, + "learning_rate": 8.122526855132997e-05, + "loss": 0.6606, + "step": 27940 + }, + { + "epoch": 1.1488207740981182, + "grad_norm": 1.2706636190414429, + "learning_rate": 8.109457561564373e-05, + "loss": 0.6761, + "step": 27960 + }, + { + "epoch": 1.1496425343084888, + "grad_norm": 1.0990588665008545, + "learning_rate": 8.09639161600088e-05, + "loss": 0.6512, + "step": 27980 + }, + { + "epoch": 1.1504642945188595, + "grad_norm": 1.1541229486465454, + "learning_rate": 8.084635146662451e-05, + "loss": 0.663, + "step": 28000 + }, + { + "epoch": 1.1504642945188595, + "eval_loss": 0.9906555414199829, + "eval_runtime": 16.5563, + "eval_samples_per_second": 158.248, + "eval_steps_per_second": 4.953, + "step": 28000 + }, + { + "epoch": 1.15128605472923, + "grad_norm": 1.1189225912094116, + "learning_rate": 8.071575626051133e-05, + "loss": 0.6602, + "step": 28020 + }, + { + "epoch": 1.1521078149396007, + "grad_norm": 1.2261697053909302, + "learning_rate": 8.058519520530936e-05, + "loss": 0.6556, + "step": 28040 + }, + { + "epoch": 1.1529295751499713, + "grad_norm": 1.10615074634552, + "learning_rate": 8.045466853223223e-05, + "loss": 0.6632, + "step": 28060 + }, + { + "epoch": 1.153751335360342, + "grad_norm": 1.2373408079147339, + "learning_rate": 8.032417647243263e-05, + "loss": 0.6697, + "step": 28080 + }, + { + "epoch": 1.1545730955707125, + "grad_norm": 1.282421588897705, + "learning_rate": 8.019371925700199e-05, + "loss": 0.6827, + "step": 28100 + }, + { + "epoch": 1.1553948557810831, + "grad_norm": 1.3164217472076416, + "learning_rate": 8.006329711696991e-05, + "loss": 0.6604, + "step": 28120 + }, + { + "epoch": 1.1562166159914538, + "grad_norm": 1.0808019638061523, + "learning_rate": 7.993291028330409e-05, + "loss": 0.6437, + "step": 28140 + }, + { + "epoch": 1.1570383762018244, + "grad_norm": 1.116471767425537, + "learning_rate": 7.980255898690942e-05, + "loss": 0.6599, + "step": 28160 + }, + { + "epoch": 1.157860136412195, + "grad_norm": 1.13624906539917, + "learning_rate": 7.967224345862811e-05, + "loss": 0.6707, + "step": 28180 + }, + { + "epoch": 1.1586818966225656, + "grad_norm": 1.1284352540969849, + "learning_rate": 7.954196392923891e-05, + "loss": 0.6627, + "step": 28200 + }, + { + "epoch": 1.1595036568329362, + "grad_norm": 0.9575105905532837, + "learning_rate": 7.941172062945683e-05, + "loss": 0.669, + "step": 28220 + }, + { + "epoch": 1.1603254170433068, + "grad_norm": 1.4857319593429565, + "learning_rate": 7.928151378993277e-05, + "loss": 0.684, + "step": 28240 + }, + { + "epoch": 1.1611471772536774, + "grad_norm": 1.1151981353759766, + "learning_rate": 7.915134364125295e-05, + "loss": 0.6515, + "step": 28260 + }, + { + "epoch": 1.161968937464048, + "grad_norm": 1.0837492942810059, + "learning_rate": 7.902121041393876e-05, + "loss": 0.6808, + "step": 28280 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 1.3104796409606934, + "learning_rate": 7.889111433844603e-05, + "loss": 0.6816, + "step": 28300 + }, + { + "epoch": 1.1636124578847893, + "grad_norm": 1.266835331916809, + "learning_rate": 7.876105564516498e-05, + "loss": 0.6494, + "step": 28320 + }, + { + "epoch": 1.16443421809516, + "grad_norm": 1.1202675104141235, + "learning_rate": 7.863103456441951e-05, + "loss": 0.6517, + "step": 28340 + }, + { + "epoch": 1.1652559783055305, + "grad_norm": 1.1493626832962036, + "learning_rate": 7.850105132646699e-05, + "loss": 0.6545, + "step": 28360 + }, + { + "epoch": 1.1660777385159011, + "grad_norm": 1.259341835975647, + "learning_rate": 7.837110616149767e-05, + "loss": 0.6606, + "step": 28380 + }, + { + "epoch": 1.1668994987262717, + "grad_norm": 1.210664987564087, + "learning_rate": 7.824119929963444e-05, + "loss": 0.645, + "step": 28400 + }, + { + "epoch": 1.1677212589366424, + "grad_norm": 1.1816967725753784, + "learning_rate": 7.81113309709324e-05, + "loss": 0.6413, + "step": 28420 + }, + { + "epoch": 1.168543019147013, + "grad_norm": 1.0502477884292603, + "learning_rate": 7.79815014053783e-05, + "loss": 0.6552, + "step": 28440 + }, + { + "epoch": 1.1693647793573836, + "grad_norm": 1.1239842176437378, + "learning_rate": 7.785171083289039e-05, + "loss": 0.6106, + "step": 28460 + }, + { + "epoch": 1.1701865395677542, + "grad_norm": 1.163809061050415, + "learning_rate": 7.772195948331769e-05, + "loss": 0.6831, + "step": 28480 + }, + { + "epoch": 1.1710082997781248, + "grad_norm": 1.0698950290679932, + "learning_rate": 7.759873224073584e-05, + "loss": 0.6741, + "step": 28500 + }, + { + "epoch": 1.1718300599884954, + "grad_norm": 1.140785574913025, + "learning_rate": 7.7469058036688e-05, + "loss": 0.6761, + "step": 28520 + }, + { + "epoch": 1.172651820198866, + "grad_norm": 1.134072184562683, + "learning_rate": 7.733942373320407e-05, + "loss": 0.6785, + "step": 28540 + }, + { + "epoch": 1.1734735804092367, + "grad_norm": 1.0939021110534668, + "learning_rate": 7.720982955985641e-05, + "loss": 0.6478, + "step": 28560 + }, + { + "epoch": 1.1742953406196073, + "grad_norm": 1.268872618675232, + "learning_rate": 7.708027574614631e-05, + "loss": 0.6623, + "step": 28580 + }, + { + "epoch": 1.175117100829978, + "grad_norm": 1.1508769989013672, + "learning_rate": 7.695076252150361e-05, + "loss": 0.6663, + "step": 28600 + }, + { + "epoch": 1.1759388610403485, + "grad_norm": 1.2125996351242065, + "learning_rate": 7.682129011528627e-05, + "loss": 0.6473, + "step": 28620 + }, + { + "epoch": 1.1767606212507191, + "grad_norm": 1.1959916353225708, + "learning_rate": 7.669185875677998e-05, + "loss": 0.6421, + "step": 28640 + }, + { + "epoch": 1.1775823814610897, + "grad_norm": 1.1845420598983765, + "learning_rate": 7.656246867519772e-05, + "loss": 0.6756, + "step": 28660 + }, + { + "epoch": 1.1784041416714603, + "grad_norm": 1.0986628532409668, + "learning_rate": 7.643312009967928e-05, + "loss": 0.6591, + "step": 28680 + }, + { + "epoch": 1.179225901881831, + "grad_norm": 1.0492193698883057, + "learning_rate": 7.630381325929113e-05, + "loss": 0.6567, + "step": 28700 + }, + { + "epoch": 1.1800476620922016, + "grad_norm": 1.1858165264129639, + "learning_rate": 7.617454838302567e-05, + "loss": 0.6644, + "step": 28720 + }, + { + "epoch": 1.1808694223025722, + "grad_norm": 1.107640266418457, + "learning_rate": 7.604532569980105e-05, + "loss": 0.6514, + "step": 28740 + }, + { + "epoch": 1.1816911825129428, + "grad_norm": 1.2059592008590698, + "learning_rate": 7.591614543846075e-05, + "loss": 0.6607, + "step": 28760 + }, + { + "epoch": 1.1825129427233134, + "grad_norm": 1.0413446426391602, + "learning_rate": 7.578700782777299e-05, + "loss": 0.6722, + "step": 28780 + }, + { + "epoch": 1.183334702933684, + "grad_norm": 1.18887197971344, + "learning_rate": 7.565791309643058e-05, + "loss": 0.6796, + "step": 28800 + }, + { + "epoch": 1.1841564631440546, + "grad_norm": 1.1347589492797852, + "learning_rate": 7.552886147305034e-05, + "loss": 0.6652, + "step": 28820 + }, + { + "epoch": 1.1849782233544253, + "grad_norm": 1.0669459104537964, + "learning_rate": 7.53998531861728e-05, + "loss": 0.6557, + "step": 28840 + }, + { + "epoch": 1.1857999835647959, + "grad_norm": 1.1914751529693604, + "learning_rate": 7.527088846426164e-05, + "loss": 0.6631, + "step": 28860 + }, + { + "epoch": 1.1866217437751665, + "grad_norm": 1.1654757261276245, + "learning_rate": 7.514196753570354e-05, + "loss": 0.6392, + "step": 28880 + }, + { + "epoch": 1.187443503985537, + "grad_norm": 1.1120076179504395, + "learning_rate": 7.501309062880745e-05, + "loss": 0.6515, + "step": 28900 + }, + { + "epoch": 1.1882652641959077, + "grad_norm": 1.3329730033874512, + "learning_rate": 7.488425797180449e-05, + "loss": 0.6495, + "step": 28920 + }, + { + "epoch": 1.1890870244062783, + "grad_norm": 1.0386924743652344, + "learning_rate": 7.475546979284738e-05, + "loss": 0.6633, + "step": 28940 + }, + { + "epoch": 1.189908784616649, + "grad_norm": 1.130223035812378, + "learning_rate": 7.46267263200101e-05, + "loss": 0.6608, + "step": 28960 + }, + { + "epoch": 1.1907305448270196, + "grad_norm": 1.205684781074524, + "learning_rate": 7.44980277812874e-05, + "loss": 0.6873, + "step": 28980 + }, + { + "epoch": 1.1915523050373902, + "grad_norm": 1.1943809986114502, + "learning_rate": 7.436937440459448e-05, + "loss": 0.6384, + "step": 29000 + }, + { + "epoch": 1.1923740652477608, + "grad_norm": 1.1930698156356812, + "learning_rate": 7.424076641776657e-05, + "loss": 0.6562, + "step": 29020 + }, + { + "epoch": 1.1931958254581314, + "grad_norm": 1.1172430515289307, + "learning_rate": 7.411220404855852e-05, + "loss": 0.6548, + "step": 29040 + }, + { + "epoch": 1.194017585668502, + "grad_norm": 1.2052860260009766, + "learning_rate": 7.398368752464438e-05, + "loss": 0.6368, + "step": 29060 + }, + { + "epoch": 1.1948393458788726, + "grad_norm": 1.2752981185913086, + "learning_rate": 7.385521707361705e-05, + "loss": 0.6425, + "step": 29080 + }, + { + "epoch": 1.1956611060892433, + "grad_norm": 1.2396671772003174, + "learning_rate": 7.372679292298781e-05, + "loss": 0.6536, + "step": 29100 + }, + { + "epoch": 1.1964828662996139, + "grad_norm": 1.0870732069015503, + "learning_rate": 7.359841530018589e-05, + "loss": 0.6926, + "step": 29120 + }, + { + "epoch": 1.1973046265099845, + "grad_norm": 1.238871455192566, + "learning_rate": 7.347008443255825e-05, + "loss": 0.6733, + "step": 29140 + }, + { + "epoch": 1.198126386720355, + "grad_norm": 0.9936127662658691, + "learning_rate": 7.334180054736892e-05, + "loss": 0.6675, + "step": 29160 + }, + { + "epoch": 1.1989481469307257, + "grad_norm": 1.1829917430877686, + "learning_rate": 7.321356387179881e-05, + "loss": 0.6501, + "step": 29180 + }, + { + "epoch": 1.1997699071410963, + "grad_norm": 1.218711256980896, + "learning_rate": 7.308537463294525e-05, + "loss": 0.6366, + "step": 29200 + }, + { + "epoch": 1.2005916673514667, + "grad_norm": 1.1491694450378418, + "learning_rate": 7.29572330578214e-05, + "loss": 0.6563, + "step": 29220 + }, + { + "epoch": 1.2014134275618376, + "grad_norm": 1.1249312162399292, + "learning_rate": 7.28291393733562e-05, + "loss": 0.6522, + "step": 29240 + }, + { + "epoch": 1.202235187772208, + "grad_norm": 1.1913214921951294, + "learning_rate": 7.270109380639374e-05, + "loss": 0.6663, + "step": 29260 + }, + { + "epoch": 1.2030569479825788, + "grad_norm": 1.059322714805603, + "learning_rate": 7.257309658369278e-05, + "loss": 0.6719, + "step": 29280 + }, + { + "epoch": 1.2038787081929492, + "grad_norm": 1.2595094442367554, + "learning_rate": 7.244514793192658e-05, + "loss": 0.6652, + "step": 29300 + }, + { + "epoch": 1.20470046840332, + "grad_norm": 1.1310940980911255, + "learning_rate": 7.231724807768243e-05, + "loss": 0.6286, + "step": 29320 + }, + { + "epoch": 1.2055222286136904, + "grad_norm": 1.1567174196243286, + "learning_rate": 7.2189397247461e-05, + "loss": 0.6545, + "step": 29340 + }, + { + "epoch": 1.2063439888240612, + "grad_norm": 1.2142544984817505, + "learning_rate": 7.206159566767633e-05, + "loss": 0.6717, + "step": 29360 + }, + { + "epoch": 1.2071657490344316, + "grad_norm": 1.1306164264678955, + "learning_rate": 7.193384356465518e-05, + "loss": 0.6589, + "step": 29380 + }, + { + "epoch": 1.2079875092448025, + "grad_norm": 1.1609306335449219, + "learning_rate": 7.180614116463671e-05, + "loss": 0.6393, + "step": 29400 + }, + { + "epoch": 1.2088092694551729, + "grad_norm": 1.0231740474700928, + "learning_rate": 7.167848869377201e-05, + "loss": 0.6623, + "step": 29420 + }, + { + "epoch": 1.2096310296655437, + "grad_norm": 1.1379834413528442, + "learning_rate": 7.15508863781238e-05, + "loss": 0.6451, + "step": 29440 + }, + { + "epoch": 1.210452789875914, + "grad_norm": 1.1678614616394043, + "learning_rate": 7.142333444366593e-05, + "loss": 0.6275, + "step": 29460 + }, + { + "epoch": 1.211274550086285, + "grad_norm": 1.2142267227172852, + "learning_rate": 7.129583311628307e-05, + "loss": 0.6486, + "step": 29480 + }, + { + "epoch": 1.2120963102966553, + "grad_norm": 1.1522984504699707, + "learning_rate": 7.116838262177025e-05, + "loss": 0.6692, + "step": 29500 + }, + { + "epoch": 1.2129180705070262, + "grad_norm": 1.2733930349349976, + "learning_rate": 7.104098318583256e-05, + "loss": 0.6467, + "step": 29520 + }, + { + "epoch": 1.2137398307173966, + "grad_norm": 1.1900440454483032, + "learning_rate": 7.09136350340845e-05, + "loss": 0.6534, + "step": 29540 + }, + { + "epoch": 1.2145615909277674, + "grad_norm": 1.1477757692337036, + "learning_rate": 7.078633839204985e-05, + "loss": 0.6601, + "step": 29560 + }, + { + "epoch": 1.2153833511381378, + "grad_norm": 1.284244418144226, + "learning_rate": 7.065909348516122e-05, + "loss": 0.6659, + "step": 29580 + }, + { + "epoch": 1.2162051113485086, + "grad_norm": 1.0393390655517578, + "learning_rate": 7.05319005387595e-05, + "loss": 0.6428, + "step": 29600 + }, + { + "epoch": 1.217026871558879, + "grad_norm": 1.0445849895477295, + "learning_rate": 7.040475977809362e-05, + "loss": 0.6619, + "step": 29620 + }, + { + "epoch": 1.2178486317692498, + "grad_norm": 1.2404284477233887, + "learning_rate": 7.027767142832012e-05, + "loss": 0.6424, + "step": 29640 + }, + { + "epoch": 1.2186703919796202, + "grad_norm": 1.1194926500320435, + "learning_rate": 7.015063571450262e-05, + "loss": 0.6492, + "step": 29660 + }, + { + "epoch": 1.219492152189991, + "grad_norm": 1.1228058338165283, + "learning_rate": 7.00236528616116e-05, + "loss": 0.6569, + "step": 29680 + }, + { + "epoch": 1.2203139124003615, + "grad_norm": 1.106655240058899, + "learning_rate": 6.989672309452398e-05, + "loss": 0.6606, + "step": 29700 + }, + { + "epoch": 1.2211356726107323, + "grad_norm": 1.1909033060073853, + "learning_rate": 6.976984663802252e-05, + "loss": 0.6867, + "step": 29720 + }, + { + "epoch": 1.2219574328211027, + "grad_norm": 1.1197348833084106, + "learning_rate": 6.964302371679578e-05, + "loss": 0.65, + "step": 29740 + }, + { + "epoch": 1.2227791930314735, + "grad_norm": 1.0467857122421265, + "learning_rate": 6.951625455543724e-05, + "loss": 0.6434, + "step": 29760 + }, + { + "epoch": 1.223600953241844, + "grad_norm": 1.1986229419708252, + "learning_rate": 6.938953937844541e-05, + "loss": 0.6278, + "step": 29780 + }, + { + "epoch": 1.2244227134522148, + "grad_norm": 1.1179389953613281, + "learning_rate": 6.926287841022312e-05, + "loss": 0.6584, + "step": 29800 + }, + { + "epoch": 1.2252444736625852, + "grad_norm": 1.0440237522125244, + "learning_rate": 6.913627187507716e-05, + "loss": 0.6305, + "step": 29820 + }, + { + "epoch": 1.226066233872956, + "grad_norm": 1.2614185810089111, + "learning_rate": 6.9009719997218e-05, + "loss": 0.6582, + "step": 29840 + }, + { + "epoch": 1.2268879940833264, + "grad_norm": 1.0907703638076782, + "learning_rate": 6.888322300075927e-05, + "loss": 0.655, + "step": 29860 + }, + { + "epoch": 1.2277097542936972, + "grad_norm": 1.0902374982833862, + "learning_rate": 6.875678110971738e-05, + "loss": 0.6338, + "step": 29880 + }, + { + "epoch": 1.2285315145040676, + "grad_norm": 1.2563419342041016, + "learning_rate": 6.863039454801119e-05, + "loss": 0.6591, + "step": 29900 + }, + { + "epoch": 1.2293532747144384, + "grad_norm": 1.1033470630645752, + "learning_rate": 6.850406353946158e-05, + "loss": 0.654, + "step": 29920 + }, + { + "epoch": 1.2301750349248088, + "grad_norm": 1.1465051174163818, + "learning_rate": 6.837778830779105e-05, + "loss": 0.6621, + "step": 29940 + }, + { + "epoch": 1.2309967951351797, + "grad_norm": 1.1070022583007812, + "learning_rate": 6.825156907662336e-05, + "loss": 0.6426, + "step": 29960 + }, + { + "epoch": 1.23181855534555, + "grad_norm": 1.1090333461761475, + "learning_rate": 6.812540606948296e-05, + "loss": 0.6625, + "step": 29980 + }, + { + "epoch": 1.232640315555921, + "grad_norm": 1.1961784362792969, + "learning_rate": 6.799929950979487e-05, + "loss": 0.6424, + "step": 30000 + }, + { + "epoch": 1.232640315555921, + "eval_loss": 0.9679434895515442, + "eval_runtime": 16.6198, + "eval_samples_per_second": 157.644, + "eval_steps_per_second": 4.934, + "step": 30000 + }, + { + "epoch": 1.2334620757662913, + "grad_norm": 1.2960805892944336, + "learning_rate": 6.787324962088411e-05, + "loss": 0.6821, + "step": 30020 + }, + { + "epoch": 1.234283835976662, + "grad_norm": 1.1521825790405273, + "learning_rate": 6.77472566259753e-05, + "loss": 0.6593, + "step": 30040 + }, + { + "epoch": 1.2351055961870325, + "grad_norm": 1.1858956813812256, + "learning_rate": 6.762132074819236e-05, + "loss": 0.662, + "step": 30060 + }, + { + "epoch": 1.2359273563974031, + "grad_norm": 1.1205803155899048, + "learning_rate": 6.749544221055808e-05, + "loss": 0.6815, + "step": 30080 + }, + { + "epoch": 1.2367491166077738, + "grad_norm": 0.9257388710975647, + "learning_rate": 6.736962123599355e-05, + "loss": 0.6531, + "step": 30100 + }, + { + "epoch": 1.2375708768181444, + "grad_norm": 0.9369202256202698, + "learning_rate": 6.724385804731811e-05, + "loss": 0.6305, + "step": 30120 + }, + { + "epoch": 1.238392637028515, + "grad_norm": 1.1999167203903198, + "learning_rate": 6.71181528672487e-05, + "loss": 0.6468, + "step": 30140 + }, + { + "epoch": 1.2392143972388856, + "grad_norm": 1.1772797107696533, + "learning_rate": 6.699250591839946e-05, + "loss": 0.654, + "step": 30160 + }, + { + "epoch": 1.2400361574492562, + "grad_norm": 1.1733454465866089, + "learning_rate": 6.68669174232815e-05, + "loss": 0.6439, + "step": 30180 + }, + { + "epoch": 1.2408579176596268, + "grad_norm": 1.0485635995864868, + "learning_rate": 6.674138760430236e-05, + "loss": 0.6505, + "step": 30200 + }, + { + "epoch": 1.2416796778699974, + "grad_norm": 1.1465091705322266, + "learning_rate": 6.66159166837657e-05, + "loss": 0.6591, + "step": 30220 + }, + { + "epoch": 1.242501438080368, + "grad_norm": 1.1369998455047607, + "learning_rate": 6.649050488387086e-05, + "loss": 0.6399, + "step": 30240 + }, + { + "epoch": 1.2433231982907387, + "grad_norm": 1.2065318822860718, + "learning_rate": 6.636515242671247e-05, + "loss": 0.6695, + "step": 30260 + }, + { + "epoch": 1.2441449585011093, + "grad_norm": 1.2178568840026855, + "learning_rate": 6.62398595342801e-05, + "loss": 0.6448, + "step": 30280 + }, + { + "epoch": 1.24496671871148, + "grad_norm": 1.1589614152908325, + "learning_rate": 6.611462642845782e-05, + "loss": 0.6308, + "step": 30300 + }, + { + "epoch": 1.2457884789218505, + "grad_norm": 1.136525273323059, + "learning_rate": 6.598945333102384e-05, + "loss": 0.6525, + "step": 30320 + }, + { + "epoch": 1.2466102391322211, + "grad_norm": 1.2322630882263184, + "learning_rate": 6.586434046365002e-05, + "loss": 0.639, + "step": 30340 + }, + { + "epoch": 1.2474319993425917, + "grad_norm": 1.171217679977417, + "learning_rate": 6.573928804790165e-05, + "loss": 0.6624, + "step": 30360 + }, + { + "epoch": 1.2482537595529624, + "grad_norm": 1.0731596946716309, + "learning_rate": 6.561429630523694e-05, + "loss": 0.658, + "step": 30380 + }, + { + "epoch": 1.249075519763333, + "grad_norm": 0.9198722839355469, + "learning_rate": 6.548936545700665e-05, + "loss": 0.656, + "step": 30400 + }, + { + "epoch": 1.2498972799737036, + "grad_norm": 1.2230961322784424, + "learning_rate": 6.53644957244537e-05, + "loss": 0.6415, + "step": 30420 + }, + { + "epoch": 1.2507190401840742, + "grad_norm": 1.3008650541305542, + "learning_rate": 6.52396873287127e-05, + "loss": 0.649, + "step": 30440 + }, + { + "epoch": 1.2515408003944448, + "grad_norm": 1.1299681663513184, + "learning_rate": 6.511494049080982e-05, + "loss": 0.6632, + "step": 30460 + }, + { + "epoch": 1.2523625606048154, + "grad_norm": 1.0429800748825073, + "learning_rate": 6.499025543166205e-05, + "loss": 0.6435, + "step": 30480 + }, + { + "epoch": 1.253184320815186, + "grad_norm": 1.313433051109314, + "learning_rate": 6.486563237207704e-05, + "loss": 0.6573, + "step": 30500 + }, + { + "epoch": 1.2540060810255567, + "grad_norm": 1.0738648176193237, + "learning_rate": 6.47472980935819e-05, + "loss": 0.6508, + "step": 30520 + }, + { + "epoch": 1.2548278412359273, + "grad_norm": 1.1511154174804688, + "learning_rate": 6.462279656782608e-05, + "loss": 0.6244, + "step": 30540 + }, + { + "epoch": 1.255649601446298, + "grad_norm": 1.1229740381240845, + "learning_rate": 6.450457814479982e-05, + "loss": 0.6303, + "step": 30560 + }, + { + "epoch": 1.2564713616566685, + "grad_norm": 1.1822351217269897, + "learning_rate": 6.438019899125807e-05, + "loss": 0.6296, + "step": 30580 + }, + { + "epoch": 1.2572931218670391, + "grad_norm": 1.127118706703186, + "learning_rate": 6.425588291764203e-05, + "loss": 0.6664, + "step": 30600 + }, + { + "epoch": 1.2581148820774097, + "grad_norm": 1.275978684425354, + "learning_rate": 6.413163014410595e-05, + "loss": 0.651, + "step": 30620 + }, + { + "epoch": 1.2589366422877803, + "grad_norm": 1.1102421283721924, + "learning_rate": 6.400744089069191e-05, + "loss": 0.6459, + "step": 30640 + }, + { + "epoch": 1.259758402498151, + "grad_norm": 1.2421194314956665, + "learning_rate": 6.388331537732954e-05, + "loss": 0.6464, + "step": 30660 + }, + { + "epoch": 1.2605801627085216, + "grad_norm": 1.248716950416565, + "learning_rate": 6.37592538238356e-05, + "loss": 0.6495, + "step": 30680 + }, + { + "epoch": 1.2614019229188922, + "grad_norm": 1.144371509552002, + "learning_rate": 6.363525644991348e-05, + "loss": 0.6405, + "step": 30700 + }, + { + "epoch": 1.2622236831292628, + "grad_norm": 1.2534958124160767, + "learning_rate": 6.351132347515303e-05, + "loss": 0.6581, + "step": 30720 + }, + { + "epoch": 1.2630454433396334, + "grad_norm": 1.2671653032302856, + "learning_rate": 6.338745511902997e-05, + "loss": 0.6547, + "step": 30740 + }, + { + "epoch": 1.263867203550004, + "grad_norm": 1.1555167436599731, + "learning_rate": 6.326365160090566e-05, + "loss": 0.6461, + "step": 30760 + }, + { + "epoch": 1.2646889637603747, + "grad_norm": 1.1524670124053955, + "learning_rate": 6.313991314002663e-05, + "loss": 0.6622, + "step": 30780 + }, + { + "epoch": 1.2655107239707453, + "grad_norm": 1.204883337020874, + "learning_rate": 6.301623995552409e-05, + "loss": 0.6624, + "step": 30800 + }, + { + "epoch": 1.2663324841811159, + "grad_norm": 1.0530204772949219, + "learning_rate": 6.289263226641375e-05, + "loss": 0.6655, + "step": 30820 + }, + { + "epoch": 1.2671542443914865, + "grad_norm": 1.1731172800064087, + "learning_rate": 6.276909029159536e-05, + "loss": 0.6524, + "step": 30840 + }, + { + "epoch": 1.267976004601857, + "grad_norm": 1.1701477766036987, + "learning_rate": 6.26456142498522e-05, + "loss": 0.6265, + "step": 30860 + }, + { + "epoch": 1.2687977648122277, + "grad_norm": 1.3820420503616333, + "learning_rate": 6.252220435985088e-05, + "loss": 0.6387, + "step": 30880 + }, + { + "epoch": 1.2696195250225983, + "grad_norm": 1.1477141380310059, + "learning_rate": 6.239886084014081e-05, + "loss": 0.6521, + "step": 30900 + }, + { + "epoch": 1.270441285232969, + "grad_norm": 1.2501583099365234, + "learning_rate": 6.22755839091538e-05, + "loss": 0.6452, + "step": 30920 + }, + { + "epoch": 1.2712630454433396, + "grad_norm": 1.110160231590271, + "learning_rate": 6.215237378520388e-05, + "loss": 0.6287, + "step": 30940 + }, + { + "epoch": 1.2720848056537102, + "grad_norm": 1.2295660972595215, + "learning_rate": 6.202923068648665e-05, + "loss": 0.6611, + "step": 30960 + }, + { + "epoch": 1.2729065658640808, + "grad_norm": 1.2019611597061157, + "learning_rate": 6.190615483107911e-05, + "loss": 0.6363, + "step": 30980 + }, + { + "epoch": 1.2737283260744514, + "grad_norm": 1.2087301015853882, + "learning_rate": 6.17831464369391e-05, + "loss": 0.6613, + "step": 31000 + }, + { + "epoch": 1.274550086284822, + "grad_norm": 1.179447889328003, + "learning_rate": 6.166020572190501e-05, + "loss": 0.6497, + "step": 31020 + }, + { + "epoch": 1.2753718464951926, + "grad_norm": 1.183194637298584, + "learning_rate": 6.153733290369536e-05, + "loss": 0.6464, + "step": 31040 + }, + { + "epoch": 1.2761936067055633, + "grad_norm": 1.2545372247695923, + "learning_rate": 6.14145281999085e-05, + "loss": 0.6517, + "step": 31060 + }, + { + "epoch": 1.2770153669159339, + "grad_norm": 1.143701434135437, + "learning_rate": 6.129179182802208e-05, + "loss": 0.629, + "step": 31080 + }, + { + "epoch": 1.2778371271263045, + "grad_norm": 1.2404268980026245, + "learning_rate": 6.116912400539277e-05, + "loss": 0.6209, + "step": 31100 + }, + { + "epoch": 1.278658887336675, + "grad_norm": 1.0369129180908203, + "learning_rate": 6.104652494925587e-05, + "loss": 0.637, + "step": 31120 + }, + { + "epoch": 1.2794806475470457, + "grad_norm": 1.1535693407058716, + "learning_rate": 6.092399487672482e-05, + "loss": 0.6406, + "step": 31140 + }, + { + "epoch": 1.2803024077574163, + "grad_norm": 1.143570899963379, + "learning_rate": 6.0801534004791005e-05, + "loss": 0.6631, + "step": 31160 + }, + { + "epoch": 1.281124167967787, + "grad_norm": 1.2489653825759888, + "learning_rate": 6.067914255032314e-05, + "loss": 0.6433, + "step": 31180 + }, + { + "epoch": 1.2819459281781576, + "grad_norm": 1.0466420650482178, + "learning_rate": 6.05568207300671e-05, + "loss": 0.6404, + "step": 31200 + }, + { + "epoch": 1.2827676883885282, + "grad_norm": 1.2191749811172485, + "learning_rate": 6.043456876064546e-05, + "loss": 0.6197, + "step": 31220 + }, + { + "epoch": 1.2835894485988988, + "grad_norm": 1.1350699663162231, + "learning_rate": 6.0312386858556956e-05, + "loss": 0.6605, + "step": 31240 + }, + { + "epoch": 1.2844112088092694, + "grad_norm": 1.1058688163757324, + "learning_rate": 6.0190275240176386e-05, + "loss": 0.6577, + "step": 31260 + }, + { + "epoch": 1.28523296901964, + "grad_norm": 1.0823215246200562, + "learning_rate": 6.006823412175404e-05, + "loss": 0.6389, + "step": 31280 + }, + { + "epoch": 1.2860547292300106, + "grad_norm": 1.1849805116653442, + "learning_rate": 5.9946263719415295e-05, + "loss": 0.665, + "step": 31300 + }, + { + "epoch": 1.2868764894403812, + "grad_norm": 1.1973915100097656, + "learning_rate": 5.982436424916037e-05, + "loss": 0.6612, + "step": 31320 + }, + { + "epoch": 1.2876982496507519, + "grad_norm": 1.1198099851608276, + "learning_rate": 5.970253592686389e-05, + "loss": 0.6371, + "step": 31340 + }, + { + "epoch": 1.2885200098611225, + "grad_norm": 0.9331278204917908, + "learning_rate": 5.9580778968274354e-05, + "loss": 0.6204, + "step": 31360 + }, + { + "epoch": 1.289341770071493, + "grad_norm": 1.0500894784927368, + "learning_rate": 5.9459093589014e-05, + "loss": 0.6481, + "step": 31380 + }, + { + "epoch": 1.2901635302818637, + "grad_norm": 1.1697520017623901, + "learning_rate": 5.9337480004578285e-05, + "loss": 0.6694, + "step": 31400 + }, + { + "epoch": 1.2909852904922343, + "grad_norm": 1.1045079231262207, + "learning_rate": 5.921593843033548e-05, + "loss": 0.6437, + "step": 31420 + }, + { + "epoch": 1.291807050702605, + "grad_norm": 1.0866926908493042, + "learning_rate": 5.9094469081526396e-05, + "loss": 0.6278, + "step": 31440 + }, + { + "epoch": 1.2926288109129755, + "grad_norm": 1.1703931093215942, + "learning_rate": 5.8973072173263865e-05, + "loss": 0.6523, + "step": 31460 + }, + { + "epoch": 1.2934505711233462, + "grad_norm": 1.2150297164916992, + "learning_rate": 5.885174792053245e-05, + "loss": 0.648, + "step": 31480 + }, + { + "epoch": 1.2942723313337168, + "grad_norm": 1.2517179250717163, + "learning_rate": 5.8730496538188084e-05, + "loss": 0.6439, + "step": 31500 + }, + { + "epoch": 1.2950940915440874, + "grad_norm": 1.0839895009994507, + "learning_rate": 5.8609318240957635e-05, + "loss": 0.6365, + "step": 31520 + }, + { + "epoch": 1.295915851754458, + "grad_norm": 1.081815481185913, + "learning_rate": 5.848821324343854e-05, + "loss": 0.6467, + "step": 31540 + }, + { + "epoch": 1.2967376119648286, + "grad_norm": 1.0636696815490723, + "learning_rate": 5.836718176009848e-05, + "loss": 0.6506, + "step": 31560 + }, + { + "epoch": 1.2975593721751992, + "grad_norm": 1.1643255949020386, + "learning_rate": 5.825227013865635e-05, + "loss": 0.6489, + "step": 31580 + }, + { + "epoch": 1.2983811323855698, + "grad_norm": 1.1235414743423462, + "learning_rate": 5.813742524085972e-05, + "loss": 0.652, + "step": 31600 + }, + { + "epoch": 1.2992028925959405, + "grad_norm": 1.1215312480926514, + "learning_rate": 5.801660816024896e-05, + "loss": 0.6475, + "step": 31620 + }, + { + "epoch": 1.300024652806311, + "grad_norm": 1.1635875701904297, + "learning_rate": 5.7895865428993504e-05, + "loss": 0.6325, + "step": 31640 + }, + { + "epoch": 1.3008464130166817, + "grad_norm": 1.1717442274093628, + "learning_rate": 5.777519726091938e-05, + "loss": 0.6301, + "step": 31660 + }, + { + "epoch": 1.3016681732270523, + "grad_norm": 1.0808964967727661, + "learning_rate": 5.765460386972068e-05, + "loss": 0.636, + "step": 31680 + }, + { + "epoch": 1.302489933437423, + "grad_norm": 1.1285815238952637, + "learning_rate": 5.7534085468958965e-05, + "loss": 0.6405, + "step": 31700 + }, + { + "epoch": 1.3033116936477935, + "grad_norm": 1.0786865949630737, + "learning_rate": 5.7413642272063164e-05, + "loss": 0.6418, + "step": 31720 + }, + { + "epoch": 1.3041334538581641, + "grad_norm": 1.243355631828308, + "learning_rate": 5.729327449232873e-05, + "loss": 0.6563, + "step": 31740 + }, + { + "epoch": 1.3049552140685348, + "grad_norm": 1.1835846900939941, + "learning_rate": 5.717298234291786e-05, + "loss": 0.6447, + "step": 31760 + }, + { + "epoch": 1.3057769742789054, + "grad_norm": 1.239445686340332, + "learning_rate": 5.705276603685868e-05, + "loss": 0.6445, + "step": 31780 + }, + { + "epoch": 1.306598734489276, + "grad_norm": 1.0167465209960938, + "learning_rate": 5.693262578704492e-05, + "loss": 0.6587, + "step": 31800 + }, + { + "epoch": 1.3074204946996466, + "grad_norm": 1.2314865589141846, + "learning_rate": 5.6812561806235855e-05, + "loss": 0.6464, + "step": 31820 + }, + { + "epoch": 1.3082422549100172, + "grad_norm": 1.1035876274108887, + "learning_rate": 5.6692574307055476e-05, + "loss": 0.6549, + "step": 31840 + }, + { + "epoch": 1.3090640151203878, + "grad_norm": 1.2363945245742798, + "learning_rate": 5.6572663501992416e-05, + "loss": 0.6621, + "step": 31860 + }, + { + "epoch": 1.3098857753307585, + "grad_norm": 0.9777959585189819, + "learning_rate": 5.645282960339944e-05, + "loss": 0.6054, + "step": 31880 + }, + { + "epoch": 1.310707535541129, + "grad_norm": 1.1316789388656616, + "learning_rate": 5.633307282349325e-05, + "loss": 0.6553, + "step": 31900 + }, + { + "epoch": 1.3115292957514997, + "grad_norm": 1.0634005069732666, + "learning_rate": 5.6213393374353814e-05, + "loss": 0.619, + "step": 31920 + }, + { + "epoch": 1.3123510559618703, + "grad_norm": 1.0504931211471558, + "learning_rate": 5.609379146792426e-05, + "loss": 0.6504, + "step": 31940 + }, + { + "epoch": 1.313172816172241, + "grad_norm": 1.1487746238708496, + "learning_rate": 5.597426731601034e-05, + "loss": 0.6555, + "step": 31960 + }, + { + "epoch": 1.3139945763826115, + "grad_norm": 1.0530478954315186, + "learning_rate": 5.585482113028009e-05, + "loss": 0.6315, + "step": 31980 + }, + { + "epoch": 1.3148163365929821, + "grad_norm": 1.161407709121704, + "learning_rate": 5.5735453122263595e-05, + "loss": 0.6467, + "step": 32000 + }, + { + "epoch": 1.3148163365929821, + "eval_loss": 0.9416676163673401, + "eval_runtime": 16.767, + "eval_samples_per_second": 156.259, + "eval_steps_per_second": 4.891, + "step": 32000 + }, + { + "epoch": 1.3156380968033528, + "grad_norm": 1.104887843132019, + "learning_rate": 5.5616163503352314e-05, + "loss": 0.6367, + "step": 32020 + }, + { + "epoch": 1.3164598570137234, + "grad_norm": 0.9899182319641113, + "learning_rate": 5.5496952484799114e-05, + "loss": 0.6521, + "step": 32040 + }, + { + "epoch": 1.317281617224094, + "grad_norm": 1.3830389976501465, + "learning_rate": 5.537782027771736e-05, + "loss": 0.6269, + "step": 32060 + }, + { + "epoch": 1.3181033774344646, + "grad_norm": 1.158103108406067, + "learning_rate": 5.5258767093081165e-05, + "loss": 0.6405, + "step": 32080 + }, + { + "epoch": 1.3189251376448352, + "grad_norm": 1.2154991626739502, + "learning_rate": 5.513979314172449e-05, + "loss": 0.626, + "step": 32100 + }, + { + "epoch": 1.3197468978552058, + "grad_norm": 1.2084523439407349, + "learning_rate": 5.502089863434101e-05, + "loss": 0.6438, + "step": 32120 + }, + { + "epoch": 1.3205686580655764, + "grad_norm": 1.1323741674423218, + "learning_rate": 5.490208378148385e-05, + "loss": 0.6553, + "step": 32140 + }, + { + "epoch": 1.321390418275947, + "grad_norm": 1.1183146238327026, + "learning_rate": 5.478334879356488e-05, + "loss": 0.6435, + "step": 32160 + }, + { + "epoch": 1.3222121784863177, + "grad_norm": 1.035343885421753, + "learning_rate": 5.466469388085467e-05, + "loss": 0.6335, + "step": 32180 + }, + { + "epoch": 1.3230339386966883, + "grad_norm": 0.9998947381973267, + "learning_rate": 5.454611925348191e-05, + "loss": 0.6473, + "step": 32200 + }, + { + "epoch": 1.323855698907059, + "grad_norm": 1.1686418056488037, + "learning_rate": 5.442762512143311e-05, + "loss": 0.6328, + "step": 32220 + }, + { + "epoch": 1.3246774591174295, + "grad_norm": 1.1200451850891113, + "learning_rate": 5.4309211694552334e-05, + "loss": 0.6295, + "step": 32240 + }, + { + "epoch": 1.3254992193278001, + "grad_norm": 1.2187868356704712, + "learning_rate": 5.41908791825406e-05, + "loss": 0.6223, + "step": 32260 + }, + { + "epoch": 1.3263209795381707, + "grad_norm": 1.0789508819580078, + "learning_rate": 5.4072627794955697e-05, + "loss": 0.6311, + "step": 32280 + }, + { + "epoch": 1.3271427397485414, + "grad_norm": 1.2329927682876587, + "learning_rate": 5.395445774121166e-05, + "loss": 0.6376, + "step": 32300 + }, + { + "epoch": 1.327964499958912, + "grad_norm": 1.2252501249313354, + "learning_rate": 5.3836369230578665e-05, + "loss": 0.6175, + "step": 32320 + }, + { + "epoch": 1.3287862601692826, + "grad_norm": 1.0544497966766357, + "learning_rate": 5.371836247218232e-05, + "loss": 0.6348, + "step": 32340 + }, + { + "epoch": 1.3296080203796532, + "grad_norm": 1.1359783411026, + "learning_rate": 5.360043767500348e-05, + "loss": 0.6333, + "step": 32360 + }, + { + "epoch": 1.3304297805900238, + "grad_norm": 1.1277779340744019, + "learning_rate": 5.3482595047878004e-05, + "loss": 0.64, + "step": 32380 + }, + { + "epoch": 1.3312515408003944, + "grad_norm": 1.1540424823760986, + "learning_rate": 5.3364834799495934e-05, + "loss": 0.6403, + "step": 32400 + }, + { + "epoch": 1.332073301010765, + "grad_norm": 1.007522702217102, + "learning_rate": 5.324715713840174e-05, + "loss": 0.6374, + "step": 32420 + }, + { + "epoch": 1.3328950612211357, + "grad_norm": 1.1693965196609497, + "learning_rate": 5.3129562272993437e-05, + "loss": 0.6387, + "step": 32440 + }, + { + "epoch": 1.3337168214315063, + "grad_norm": 1.1591569185256958, + "learning_rate": 5.301205041152253e-05, + "loss": 0.6613, + "step": 32460 + }, + { + "epoch": 1.3345385816418769, + "grad_norm": 1.1624069213867188, + "learning_rate": 5.2900491215067996e-05, + "loss": 0.6349, + "step": 32480 + }, + { + "epoch": 1.3353603418522475, + "grad_norm": 1.181348204612732, + "learning_rate": 5.2783141809701195e-05, + "loss": 0.6565, + "step": 32500 + }, + { + "epoch": 1.3361821020626181, + "grad_norm": 1.1376749277114868, + "learning_rate": 5.266587602175571e-05, + "loss": 0.642, + "step": 32520 + }, + { + "epoch": 1.3370038622729887, + "grad_norm": 1.161271572113037, + "learning_rate": 5.25486940589003e-05, + "loss": 0.6378, + "step": 32540 + }, + { + "epoch": 1.3378256224833593, + "grad_norm": 1.0550984144210815, + "learning_rate": 5.243159612865513e-05, + "loss": 0.6456, + "step": 32560 + }, + { + "epoch": 1.33864738269373, + "grad_norm": 1.299521803855896, + "learning_rate": 5.2314582438391666e-05, + "loss": 0.6495, + "step": 32580 + }, + { + "epoch": 1.3394691429041006, + "grad_norm": 1.115898847579956, + "learning_rate": 5.2197653195332094e-05, + "loss": 0.6439, + "step": 32600 + }, + { + "epoch": 1.3402909031144712, + "grad_norm": 1.1047664880752563, + "learning_rate": 5.208080860654916e-05, + "loss": 0.6428, + "step": 32620 + }, + { + "epoch": 1.3411126633248418, + "grad_norm": 1.0742267370224, + "learning_rate": 5.196404887896562e-05, + "loss": 0.633, + "step": 32640 + }, + { + "epoch": 1.3419344235352124, + "grad_norm": 1.079347014427185, + "learning_rate": 5.18473742193539e-05, + "loss": 0.628, + "step": 32660 + }, + { + "epoch": 1.342756183745583, + "grad_norm": 1.0724397897720337, + "learning_rate": 5.1730784834336e-05, + "loss": 0.6421, + "step": 32680 + }, + { + "epoch": 1.3435779439559536, + "grad_norm": 1.193382740020752, + "learning_rate": 5.161428093038255e-05, + "loss": 0.6263, + "step": 32700 + }, + { + "epoch": 1.3443997041663243, + "grad_norm": 1.219397783279419, + "learning_rate": 5.149786271381314e-05, + "loss": 0.637, + "step": 32720 + }, + { + "epoch": 1.3452214643766949, + "grad_norm": 1.2231544256210327, + "learning_rate": 5.1381530390795365e-05, + "loss": 0.6491, + "step": 32740 + }, + { + "epoch": 1.3460432245870655, + "grad_norm": 1.1197011470794678, + "learning_rate": 5.1265284167344906e-05, + "loss": 0.6405, + "step": 32760 + }, + { + "epoch": 1.346864984797436, + "grad_norm": 1.1939942836761475, + "learning_rate": 5.114912424932485e-05, + "loss": 0.6389, + "step": 32780 + }, + { + "epoch": 1.3476867450078067, + "grad_norm": 1.029039740562439, + "learning_rate": 5.103305084244545e-05, + "loss": 0.6434, + "step": 32800 + }, + { + "epoch": 1.3485085052181773, + "grad_norm": 1.1776632070541382, + "learning_rate": 5.0917064152263804e-05, + "loss": 0.6287, + "step": 32820 + }, + { + "epoch": 1.349330265428548, + "grad_norm": 1.1028509140014648, + "learning_rate": 5.080116438418334e-05, + "loss": 0.6407, + "step": 32840 + }, + { + "epoch": 1.3501520256389186, + "grad_norm": 1.1051464080810547, + "learning_rate": 5.068535174345373e-05, + "loss": 0.6248, + "step": 32860 + }, + { + "epoch": 1.3509737858492892, + "grad_norm": 1.0466769933700562, + "learning_rate": 5.056962643517014e-05, + "loss": 0.6292, + "step": 32880 + }, + { + "epoch": 1.3517955460596598, + "grad_norm": 1.222561001777649, + "learning_rate": 5.045398866427331e-05, + "loss": 0.6223, + "step": 32900 + }, + { + "epoch": 1.3526173062700304, + "grad_norm": 1.2215903997421265, + "learning_rate": 5.03384386355487e-05, + "loss": 0.6381, + "step": 32920 + }, + { + "epoch": 1.353439066480401, + "grad_norm": 1.1650848388671875, + "learning_rate": 5.02229765536266e-05, + "loss": 0.6543, + "step": 32940 + }, + { + "epoch": 1.3542608266907716, + "grad_norm": 1.1978867053985596, + "learning_rate": 5.010760262298145e-05, + "loss": 0.6255, + "step": 32960 + }, + { + "epoch": 1.3550825869011422, + "grad_norm": 1.2988346815109253, + "learning_rate": 4.999231704793156e-05, + "loss": 0.6388, + "step": 32980 + }, + { + "epoch": 1.3559043471115129, + "grad_norm": 0.9702379107475281, + "learning_rate": 4.987712003263892e-05, + "loss": 0.6343, + "step": 33000 + }, + { + "epoch": 1.3567261073218835, + "grad_norm": 1.037627100944519, + "learning_rate": 4.976201178110843e-05, + "loss": 0.6351, + "step": 33020 + }, + { + "epoch": 1.357547867532254, + "grad_norm": 1.2792913913726807, + "learning_rate": 4.964699249718805e-05, + "loss": 0.6324, + "step": 33040 + }, + { + "epoch": 1.3583696277426247, + "grad_norm": 1.1782094240188599, + "learning_rate": 4.953206238456804e-05, + "loss": 0.6144, + "step": 33060 + }, + { + "epoch": 1.3591913879529953, + "grad_norm": 1.1542799472808838, + "learning_rate": 4.941722164678074e-05, + "loss": 0.6236, + "step": 33080 + }, + { + "epoch": 1.360013148163366, + "grad_norm": 1.2403596639633179, + "learning_rate": 4.930247048720035e-05, + "loss": 0.6325, + "step": 33100 + }, + { + "epoch": 1.3608349083737366, + "grad_norm": 1.1392772197723389, + "learning_rate": 4.918780910904229e-05, + "loss": 0.6363, + "step": 33120 + }, + { + "epoch": 1.3616566685841072, + "grad_norm": 1.0998027324676514, + "learning_rate": 4.907323771536304e-05, + "loss": 0.6461, + "step": 33140 + }, + { + "epoch": 1.3624784287944778, + "grad_norm": 1.139052391052246, + "learning_rate": 4.895875650905967e-05, + "loss": 0.6196, + "step": 33160 + }, + { + "epoch": 1.3633001890048484, + "grad_norm": 1.0922917127609253, + "learning_rate": 4.884436569286968e-05, + "loss": 0.6418, + "step": 33180 + }, + { + "epoch": 1.364121949215219, + "grad_norm": 1.2731437683105469, + "learning_rate": 4.8730065469370345e-05, + "loss": 0.6584, + "step": 33200 + }, + { + "epoch": 1.3649437094255896, + "grad_norm": 1.07999849319458, + "learning_rate": 4.861585604097857e-05, + "loss": 0.6337, + "step": 33220 + }, + { + "epoch": 1.3657654696359602, + "grad_norm": 1.270757794380188, + "learning_rate": 4.8501737609950456e-05, + "loss": 0.6189, + "step": 33240 + }, + { + "epoch": 1.3665872298463309, + "grad_norm": 1.1716457605361938, + "learning_rate": 4.8387710378380925e-05, + "loss": 0.6443, + "step": 33260 + }, + { + "epoch": 1.3674089900567015, + "grad_norm": 1.2344413995742798, + "learning_rate": 4.827377454820351e-05, + "loss": 0.648, + "step": 33280 + }, + { + "epoch": 1.368230750267072, + "grad_norm": 1.134440541267395, + "learning_rate": 4.815993032118972e-05, + "loss": 0.6276, + "step": 33300 + }, + { + "epoch": 1.3690525104774427, + "grad_norm": 1.1542905569076538, + "learning_rate": 4.804617789894898e-05, + "loss": 0.632, + "step": 33320 + }, + { + "epoch": 1.3698742706878133, + "grad_norm": 1.0746182203292847, + "learning_rate": 4.7932517482928044e-05, + "loss": 0.6425, + "step": 33340 + }, + { + "epoch": 1.370696030898184, + "grad_norm": 1.2313367128372192, + "learning_rate": 4.7818949274410755e-05, + "loss": 0.6359, + "step": 33360 + }, + { + "epoch": 1.3715177911085545, + "grad_norm": 1.1803609132766724, + "learning_rate": 4.770547347451767e-05, + "loss": 0.6255, + "step": 33380 + }, + { + "epoch": 1.3723395513189252, + "grad_norm": 1.082554578781128, + "learning_rate": 4.759209028420567e-05, + "loss": 0.6194, + "step": 33400 + }, + { + "epoch": 1.3731613115292958, + "grad_norm": 1.1176568269729614, + "learning_rate": 4.747879990426772e-05, + "loss": 0.626, + "step": 33420 + }, + { + "epoch": 1.3739830717396664, + "grad_norm": 1.2282779216766357, + "learning_rate": 4.736560253533233e-05, + "loss": 0.6334, + "step": 33440 + }, + { + "epoch": 1.374804831950037, + "grad_norm": 1.1532173156738281, + "learning_rate": 4.725249837786333e-05, + "loss": 0.6206, + "step": 33460 + }, + { + "epoch": 1.3756265921604076, + "grad_norm": 1.1878107786178589, + "learning_rate": 4.713948763215943e-05, + "loss": 0.6585, + "step": 33480 + }, + { + "epoch": 1.3764483523707782, + "grad_norm": 1.0733743906021118, + "learning_rate": 4.7026570498354036e-05, + "loss": 0.6409, + "step": 33500 + }, + { + "epoch": 1.3772701125811488, + "grad_norm": 1.0726110935211182, + "learning_rate": 4.691374717641468e-05, + "loss": 0.6188, + "step": 33520 + }, + { + "epoch": 1.3780918727915195, + "grad_norm": 1.0967559814453125, + "learning_rate": 4.6801017866142716e-05, + "loss": 0.624, + "step": 33540 + }, + { + "epoch": 1.37891363300189, + "grad_norm": 1.1548309326171875, + "learning_rate": 4.668838276717321e-05, + "loss": 0.6432, + "step": 33560 + }, + { + "epoch": 1.3797353932122607, + "grad_norm": 1.1636457443237305, + "learning_rate": 4.6575842078974096e-05, + "loss": 0.6232, + "step": 33580 + }, + { + "epoch": 1.3805571534226313, + "grad_norm": 1.2843453884124756, + "learning_rate": 4.6463396000846385e-05, + "loss": 0.6349, + "step": 33600 + }, + { + "epoch": 1.381378913633002, + "grad_norm": 1.131871223449707, + "learning_rate": 4.635104473192334e-05, + "loss": 0.6477, + "step": 33620 + }, + { + "epoch": 1.3822006738433725, + "grad_norm": 1.2385400533676147, + "learning_rate": 4.62387884711705e-05, + "loss": 0.6233, + "step": 33640 + }, + { + "epoch": 1.3830224340537431, + "grad_norm": 1.1033365726470947, + "learning_rate": 4.612662741738501e-05, + "loss": 0.6398, + "step": 33660 + }, + { + "epoch": 1.3838441942641138, + "grad_norm": 1.232216477394104, + "learning_rate": 4.6014561769195476e-05, + "loss": 0.6356, + "step": 33680 + }, + { + "epoch": 1.3846659544744844, + "grad_norm": 1.1851789951324463, + "learning_rate": 4.5902591725061516e-05, + "loss": 0.5995, + "step": 33700 + }, + { + "epoch": 1.385487714684855, + "grad_norm": 1.0367991924285889, + "learning_rate": 4.5796308916997966e-05, + "loss": 0.6456, + "step": 33720 + }, + { + "epoch": 1.3863094748952256, + "grad_norm": 1.2613193988800049, + "learning_rate": 4.568452587095045e-05, + "loss": 0.5976, + "step": 33740 + }, + { + "epoch": 1.3871312351055962, + "grad_norm": 1.056667685508728, + "learning_rate": 4.557283901342667e-05, + "loss": 0.6341, + "step": 33760 + }, + { + "epoch": 1.3879529953159668, + "grad_norm": 0.9964428544044495, + "learning_rate": 4.546124854221549e-05, + "loss": 0.6187, + "step": 33780 + }, + { + "epoch": 1.3887747555263374, + "grad_norm": 1.2256003618240356, + "learning_rate": 4.5349754654934994e-05, + "loss": 0.6263, + "step": 33800 + }, + { + "epoch": 1.389596515736708, + "grad_norm": 1.1462249755859375, + "learning_rate": 4.523835754903235e-05, + "loss": 0.6133, + "step": 33820 + }, + { + "epoch": 1.3904182759470787, + "grad_norm": 1.1194911003112793, + "learning_rate": 4.512705742178317e-05, + "loss": 0.6437, + "step": 33840 + }, + { + "epoch": 1.3912400361574493, + "grad_norm": 1.1602319478988647, + "learning_rate": 4.501585447029154e-05, + "loss": 0.6323, + "step": 33860 + }, + { + "epoch": 1.39206179636782, + "grad_norm": 1.1044458150863647, + "learning_rate": 4.490474889148918e-05, + "loss": 0.6317, + "step": 33880 + }, + { + "epoch": 1.3928835565781905, + "grad_norm": 1.1019541025161743, + "learning_rate": 4.479374088213561e-05, + "loss": 0.6154, + "step": 33900 + }, + { + "epoch": 1.3937053167885611, + "grad_norm": 1.163619875907898, + "learning_rate": 4.468283063881745e-05, + "loss": 0.64, + "step": 33920 + }, + { + "epoch": 1.3945270769989317, + "grad_norm": 1.2417570352554321, + "learning_rate": 4.4572018357948163e-05, + "loss": 0.6219, + "step": 33940 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 1.0506731271743774, + "learning_rate": 4.446130423576788e-05, + "loss": 0.6288, + "step": 33960 + }, + { + "epoch": 1.396170597419673, + "grad_norm": 1.1237679719924927, + "learning_rate": 4.4350688468342625e-05, + "loss": 0.6271, + "step": 33980 + }, + { + "epoch": 1.3969923576300436, + "grad_norm": 1.254115343093872, + "learning_rate": 4.424017125156454e-05, + "loss": 0.616, + "step": 34000 + }, + { + "epoch": 1.3969923576300436, + "eval_loss": 0.924524188041687, + "eval_runtime": 16.5191, + "eval_samples_per_second": 158.604, + "eval_steps_per_second": 4.964, + "step": 34000 + }, + { + "epoch": 1.3978141178404142, + "grad_norm": 1.0411611795425415, + "learning_rate": 4.412975278115104e-05, + "loss": 0.6529, + "step": 34020 + }, + { + "epoch": 1.3986358780507848, + "grad_norm": 1.1783146858215332, + "learning_rate": 4.401943325264478e-05, + "loss": 0.6594, + "step": 34040 + }, + { + "epoch": 1.3994576382611554, + "grad_norm": 1.1332125663757324, + "learning_rate": 4.390921286141314e-05, + "loss": 0.6194, + "step": 34060 + }, + { + "epoch": 1.400279398471526, + "grad_norm": 1.1602998971939087, + "learning_rate": 4.3799091802647954e-05, + "loss": 0.6178, + "step": 34080 + }, + { + "epoch": 1.4011011586818967, + "grad_norm": 1.1415718793869019, + "learning_rate": 4.368907027136512e-05, + "loss": 0.6115, + "step": 34100 + }, + { + "epoch": 1.4019229188922673, + "grad_norm": 1.0940213203430176, + "learning_rate": 4.3579148462404273e-05, + "loss": 0.6163, + "step": 34120 + }, + { + "epoch": 1.402744679102638, + "grad_norm": 1.0750993490219116, + "learning_rate": 4.346932657042855e-05, + "loss": 0.6368, + "step": 34140 + }, + { + "epoch": 1.4035664393130085, + "grad_norm": 1.1632193326950073, + "learning_rate": 4.335960478992399e-05, + "loss": 0.6357, + "step": 34160 + }, + { + "epoch": 1.4043881995233791, + "grad_norm": 0.9799935221672058, + "learning_rate": 4.324998331519954e-05, + "loss": 0.6241, + "step": 34180 + }, + { + "epoch": 1.4052099597337497, + "grad_norm": 1.2228275537490845, + "learning_rate": 4.314046234038624e-05, + "loss": 0.6131, + "step": 34200 + }, + { + "epoch": 1.4060317199441204, + "grad_norm": 1.1572198867797852, + "learning_rate": 4.3031042059437423e-05, + "loss": 0.6143, + "step": 34220 + }, + { + "epoch": 1.406853480154491, + "grad_norm": 1.0190701484680176, + "learning_rate": 4.292172266612794e-05, + "loss": 0.6292, + "step": 34240 + }, + { + "epoch": 1.4076752403648616, + "grad_norm": 1.032221794128418, + "learning_rate": 4.2812504354053986e-05, + "loss": 0.6137, + "step": 34260 + }, + { + "epoch": 1.4084970005752322, + "grad_norm": 1.2286864519119263, + "learning_rate": 4.270338731663285e-05, + "loss": 0.6144, + "step": 34280 + }, + { + "epoch": 1.4093187607856028, + "grad_norm": 1.0505238771438599, + "learning_rate": 4.259437174710239e-05, + "loss": 0.632, + "step": 34300 + }, + { + "epoch": 1.4101405209959734, + "grad_norm": 1.101257562637329, + "learning_rate": 4.248545783852077e-05, + "loss": 0.6191, + "step": 34320 + }, + { + "epoch": 1.410962281206344, + "grad_norm": 1.1456806659698486, + "learning_rate": 4.237664578376611e-05, + "loss": 0.6128, + "step": 34340 + }, + { + "epoch": 1.4117840414167147, + "grad_norm": 1.186767339706421, + "learning_rate": 4.226793577553626e-05, + "loss": 0.6076, + "step": 34360 + }, + { + "epoch": 1.4126058016270853, + "grad_norm": 1.081678867340088, + "learning_rate": 4.215932800634823e-05, + "loss": 0.5981, + "step": 34380 + }, + { + "epoch": 1.4134275618374559, + "grad_norm": 1.1703848838806152, + "learning_rate": 4.205082266853803e-05, + "loss": 0.6327, + "step": 34400 + }, + { + "epoch": 1.4142493220478265, + "grad_norm": 1.0651689767837524, + "learning_rate": 4.194241995426025e-05, + "loss": 0.6534, + "step": 34420 + }, + { + "epoch": 1.415071082258197, + "grad_norm": 0.9564984440803528, + "learning_rate": 4.183412005548771e-05, + "loss": 0.6247, + "step": 34440 + }, + { + "epoch": 1.4158928424685677, + "grad_norm": 1.0257432460784912, + "learning_rate": 4.172592316401129e-05, + "loss": 0.6221, + "step": 34460 + }, + { + "epoch": 1.4167146026789383, + "grad_norm": 1.0298326015472412, + "learning_rate": 4.1617829471439286e-05, + "loss": 0.6157, + "step": 34480 + }, + { + "epoch": 1.417536362889309, + "grad_norm": 1.0720484256744385, + "learning_rate": 4.150983916919735e-05, + "loss": 0.6086, + "step": 34500 + }, + { + "epoch": 1.4183581230996796, + "grad_norm": 1.0548421144485474, + "learning_rate": 4.140195244852797e-05, + "loss": 0.6133, + "step": 34520 + }, + { + "epoch": 1.4191798833100502, + "grad_norm": 1.1134650707244873, + "learning_rate": 4.1294169500490244e-05, + "loss": 0.613, + "step": 34540 + }, + { + "epoch": 1.4200016435204208, + "grad_norm": 1.135111689567566, + "learning_rate": 4.1186490515959456e-05, + "loss": 0.6121, + "step": 34560 + }, + { + "epoch": 1.4208234037307914, + "grad_norm": 1.080965280532837, + "learning_rate": 4.107891568562675e-05, + "loss": 0.6202, + "step": 34580 + }, + { + "epoch": 1.421645163941162, + "grad_norm": 1.1884208917617798, + "learning_rate": 4.097144519999898e-05, + "loss": 0.64, + "step": 34600 + }, + { + "epoch": 1.4224669241515326, + "grad_norm": 1.1158133745193481, + "learning_rate": 4.086407924939803e-05, + "loss": 0.6285, + "step": 34620 + }, + { + "epoch": 1.4232886843619033, + "grad_norm": 1.0862083435058594, + "learning_rate": 4.0756818023960765e-05, + "loss": 0.6368, + "step": 34640 + }, + { + "epoch": 1.4241104445722739, + "grad_norm": 1.0684436559677124, + "learning_rate": 4.0649661713638544e-05, + "loss": 0.6247, + "step": 34660 + }, + { + "epoch": 1.4249322047826445, + "grad_norm": 1.1276649236679077, + "learning_rate": 4.054261050819691e-05, + "loss": 0.6464, + "step": 34680 + }, + { + "epoch": 1.425753964993015, + "grad_norm": 1.162343144416809, + "learning_rate": 4.043566459721537e-05, + "loss": 0.6193, + "step": 34700 + }, + { + "epoch": 1.4265757252033857, + "grad_norm": 0.9801756143569946, + "learning_rate": 4.033416368328017e-05, + "loss": 0.6304, + "step": 34720 + }, + { + "epoch": 1.4273974854137563, + "grad_norm": 1.1881067752838135, + "learning_rate": 4.022742364106714e-05, + "loss": 0.6233, + "step": 34740 + }, + { + "epoch": 1.428219245624127, + "grad_norm": 1.0229750871658325, + "learning_rate": 4.012078945148589e-05, + "loss": 0.6281, + "step": 34760 + }, + { + "epoch": 1.4290410058344976, + "grad_norm": 1.2104301452636719, + "learning_rate": 4.001426130337733e-05, + "loss": 0.616, + "step": 34780 + }, + { + "epoch": 1.4298627660448682, + "grad_norm": 1.0440176725387573, + "learning_rate": 3.990783938539456e-05, + "loss": 0.6205, + "step": 34800 + }, + { + "epoch": 1.4306845262552388, + "grad_norm": 1.0981454849243164, + "learning_rate": 3.98068371306235e-05, + "loss": 0.6387, + "step": 34820 + }, + { + "epoch": 1.4315062864656094, + "grad_norm": 1.1962534189224243, + "learning_rate": 3.970062290328703e-05, + "loss": 0.6029, + "step": 34840 + }, + { + "epoch": 1.43232804667598, + "grad_norm": 1.0792949199676514, + "learning_rate": 3.959451546150584e-05, + "loss": 0.6446, + "step": 34860 + }, + { + "epoch": 1.4331498068863506, + "grad_norm": 1.2667758464813232, + "learning_rate": 3.948851499318811e-05, + "loss": 0.6233, + "step": 34880 + }, + { + "epoch": 1.4339715670967212, + "grad_norm": 0.9692990779876709, + "learning_rate": 3.9382621686052454e-05, + "loss": 0.6101, + "step": 34900 + }, + { + "epoch": 1.4347933273070919, + "grad_norm": 1.0378152132034302, + "learning_rate": 3.927683572762778e-05, + "loss": 0.6223, + "step": 34920 + }, + { + "epoch": 1.4356150875174625, + "grad_norm": 1.0101780891418457, + "learning_rate": 3.9171157305252884e-05, + "loss": 0.6109, + "step": 34940 + }, + { + "epoch": 1.436436847727833, + "grad_norm": 1.1455594301223755, + "learning_rate": 3.9065586606076064e-05, + "loss": 0.6373, + "step": 34960 + }, + { + "epoch": 1.4372586079382037, + "grad_norm": 0.9811462759971619, + "learning_rate": 3.8960123817055e-05, + "loss": 0.6111, + "step": 34980 + }, + { + "epoch": 1.4380803681485743, + "grad_norm": 1.0883426666259766, + "learning_rate": 3.8854769124956104e-05, + "loss": 0.6136, + "step": 35000 + }, + { + "epoch": 1.438902128358945, + "grad_norm": 1.2005125284194946, + "learning_rate": 3.874952271635444e-05, + "loss": 0.624, + "step": 35020 + }, + { + "epoch": 1.4397238885693155, + "grad_norm": 1.1082879304885864, + "learning_rate": 3.864438477763327e-05, + "loss": 0.6224, + "step": 35040 + }, + { + "epoch": 1.4405456487796862, + "grad_norm": 1.1191489696502686, + "learning_rate": 3.8539355494983865e-05, + "loss": 0.6174, + "step": 35060 + }, + { + "epoch": 1.4413674089900568, + "grad_norm": 1.131309986114502, + "learning_rate": 3.843443505440494e-05, + "loss": 0.6212, + "step": 35080 + }, + { + "epoch": 1.4421891692004274, + "grad_norm": 1.216101884841919, + "learning_rate": 3.832962364170251e-05, + "loss": 0.6228, + "step": 35100 + }, + { + "epoch": 1.443010929410798, + "grad_norm": 1.043578863143921, + "learning_rate": 3.82249214424896e-05, + "loss": 0.6225, + "step": 35120 + }, + { + "epoch": 1.4438326896211686, + "grad_norm": 1.0260852575302124, + "learning_rate": 3.812032864218563e-05, + "loss": 0.6249, + "step": 35140 + }, + { + "epoch": 1.4446544498315392, + "grad_norm": 1.1600069999694824, + "learning_rate": 3.8015845426016494e-05, + "loss": 0.6072, + "step": 35160 + }, + { + "epoch": 1.4454762100419098, + "grad_norm": 1.0437159538269043, + "learning_rate": 3.7911471979013845e-05, + "loss": 0.619, + "step": 35180 + }, + { + "epoch": 1.4462979702522805, + "grad_norm": 1.074245572090149, + "learning_rate": 3.78072084860151e-05, + "loss": 0.6135, + "step": 35200 + }, + { + "epoch": 1.447119730462651, + "grad_norm": 1.021990180015564, + "learning_rate": 3.7703055131662854e-05, + "loss": 0.614, + "step": 35220 + }, + { + "epoch": 1.4479414906730217, + "grad_norm": 0.9545276165008545, + "learning_rate": 3.759901210040466e-05, + "loss": 0.5856, + "step": 35240 + }, + { + "epoch": 1.4487632508833923, + "grad_norm": 1.1442620754241943, + "learning_rate": 3.749507957649274e-05, + "loss": 0.6096, + "step": 35260 + }, + { + "epoch": 1.449585011093763, + "grad_norm": 1.0844465494155884, + "learning_rate": 3.7391257743983554e-05, + "loss": 0.6105, + "step": 35280 + }, + { + "epoch": 1.4504067713041335, + "grad_norm": 1.13474702835083, + "learning_rate": 3.728754678673762e-05, + "loss": 0.6214, + "step": 35300 + }, + { + "epoch": 1.4512285315145041, + "grad_norm": 1.0782667398452759, + "learning_rate": 3.7183946888419066e-05, + "loss": 0.6417, + "step": 35320 + }, + { + "epoch": 1.4520502917248748, + "grad_norm": 1.2489984035491943, + "learning_rate": 3.708045823249531e-05, + "loss": 0.6105, + "step": 35340 + }, + { + "epoch": 1.4528720519352452, + "grad_norm": 1.113853096961975, + "learning_rate": 3.69770810022368e-05, + "loss": 0.6243, + "step": 35360 + }, + { + "epoch": 1.453693812145616, + "grad_norm": 1.1300607919692993, + "learning_rate": 3.6873815380716624e-05, + "loss": 0.6248, + "step": 35380 + }, + { + "epoch": 1.4545155723559864, + "grad_norm": 1.0935343503952026, + "learning_rate": 3.6770661550810316e-05, + "loss": 0.6188, + "step": 35400 + }, + { + "epoch": 1.4553373325663572, + "grad_norm": 1.2234795093536377, + "learning_rate": 3.6667619695195285e-05, + "loss": 0.6157, + "step": 35420 + }, + { + "epoch": 1.4561590927767276, + "grad_norm": 1.050308346748352, + "learning_rate": 3.656468999635085e-05, + "loss": 0.6089, + "step": 35440 + }, + { + "epoch": 1.4569808529870985, + "grad_norm": 1.116862416267395, + "learning_rate": 3.646187263655745e-05, + "loss": 0.6126, + "step": 35460 + }, + { + "epoch": 1.4578026131974688, + "grad_norm": 1.2579872608184814, + "learning_rate": 3.6359167797896795e-05, + "loss": 0.6189, + "step": 35480 + }, + { + "epoch": 1.4586243734078397, + "grad_norm": 1.2237604856491089, + "learning_rate": 3.625657566225124e-05, + "loss": 0.6248, + "step": 35500 + }, + { + "epoch": 1.45944613361821, + "grad_norm": 1.2082823514938354, + "learning_rate": 3.615409641130351e-05, + "loss": 0.6277, + "step": 35520 + }, + { + "epoch": 1.460267893828581, + "grad_norm": 1.17978835105896, + "learning_rate": 3.605173022653653e-05, + "loss": 0.6119, + "step": 35540 + }, + { + "epoch": 1.4610896540389513, + "grad_norm": 1.1109099388122559, + "learning_rate": 3.5949477289232914e-05, + "loss": 0.6391, + "step": 35560 + }, + { + "epoch": 1.4619114142493221, + "grad_norm": 1.109529733657837, + "learning_rate": 3.5847337780474744e-05, + "loss": 0.6211, + "step": 35580 + }, + { + "epoch": 1.4627331744596925, + "grad_norm": 1.159828543663025, + "learning_rate": 3.5745311881143196e-05, + "loss": 0.6182, + "step": 35600 + }, + { + "epoch": 1.4635549346700634, + "grad_norm": 1.047654390335083, + "learning_rate": 3.564339977191834e-05, + "loss": 0.6166, + "step": 35620 + }, + { + "epoch": 1.4643766948804338, + "grad_norm": 1.009979248046875, + "learning_rate": 3.554160163327864e-05, + "loss": 0.6, + "step": 35640 + }, + { + "epoch": 1.4651984550908046, + "grad_norm": 1.0264675617218018, + "learning_rate": 3.543991764550079e-05, + "loss": 0.6166, + "step": 35660 + }, + { + "epoch": 1.466020215301175, + "grad_norm": 1.2139075994491577, + "learning_rate": 3.533834798865927e-05, + "loss": 0.6369, + "step": 35680 + }, + { + "epoch": 1.4668419755115458, + "grad_norm": 1.1369953155517578, + "learning_rate": 3.523689284262611e-05, + "loss": 0.6183, + "step": 35700 + }, + { + "epoch": 1.4676637357219162, + "grad_norm": 1.1860294342041016, + "learning_rate": 3.5135552387070636e-05, + "loss": 0.6152, + "step": 35720 + }, + { + "epoch": 1.468485495932287, + "grad_norm": 1.0591294765472412, + "learning_rate": 3.503432680145892e-05, + "loss": 0.6152, + "step": 35740 + }, + { + "epoch": 1.4693072561426574, + "grad_norm": 1.112328290939331, + "learning_rate": 3.493321626505375e-05, + "loss": 0.6106, + "step": 35760 + }, + { + "epoch": 1.4701290163530283, + "grad_norm": 1.0694029331207275, + "learning_rate": 3.483222095691406e-05, + "loss": 0.609, + "step": 35780 + }, + { + "epoch": 1.4709507765633987, + "grad_norm": 1.0409561395645142, + "learning_rate": 3.4731341055894785e-05, + "loss": 0.6265, + "step": 35800 + }, + { + "epoch": 1.4717725367737695, + "grad_norm": 1.130654215812683, + "learning_rate": 3.463057674064646e-05, + "loss": 0.6058, + "step": 35820 + }, + { + "epoch": 1.47259429698414, + "grad_norm": 1.0501320362091064, + "learning_rate": 3.4534957865013894e-05, + "loss": 0.588, + "step": 35840 + }, + { + "epoch": 1.4734160571945107, + "grad_norm": 1.1128648519515991, + "learning_rate": 3.443441945508704e-05, + "loss": 0.6212, + "step": 35860 + }, + { + "epoch": 1.4742378174048811, + "grad_norm": 1.1255168914794922, + "learning_rate": 3.433399715675645e-05, + "loss": 0.6282, + "step": 35880 + }, + { + "epoch": 1.475059577615252, + "grad_norm": 1.3319727182388306, + "learning_rate": 3.4238703683689755e-05, + "loss": 0.6392, + "step": 35900 + }, + { + "epoch": 1.4758813378256224, + "grad_norm": 1.1387158632278442, + "learning_rate": 3.414351531414453e-05, + "loss": 0.6058, + "step": 35920 + }, + { + "epoch": 1.4767030980359932, + "grad_norm": 1.0003697872161865, + "learning_rate": 3.404343074439131e-05, + "loss": 0.6186, + "step": 35940 + }, + { + "epoch": 1.4775248582463636, + "grad_norm": 1.2400519847869873, + "learning_rate": 3.3943462978645225e-05, + "loss": 0.6088, + "step": 35960 + }, + { + "epoch": 1.4783466184567344, + "grad_norm": 1.048429250717163, + "learning_rate": 3.384361219394153e-05, + "loss": 0.6101, + "step": 35980 + }, + { + "epoch": 1.4791683786671048, + "grad_norm": 1.1433545351028442, + "learning_rate": 3.374387856710828e-05, + "loss": 0.6407, + "step": 36000 + }, + { + "epoch": 1.4791683786671048, + "eval_loss": 0.9071117639541626, + "eval_runtime": 16.6266, + "eval_samples_per_second": 157.579, + "eval_steps_per_second": 4.932, + "step": 36000 + }, + { + "epoch": 1.4799901388774757, + "grad_norm": 1.257107138633728, + "learning_rate": 3.3644262274766024e-05, + "loss": 0.6129, + "step": 36020 + }, + { + "epoch": 1.480811899087846, + "grad_norm": 1.07807457447052, + "learning_rate": 3.354476349332756e-05, + "loss": 0.619, + "step": 36040 + }, + { + "epoch": 1.4816336592982169, + "grad_norm": 1.1075770854949951, + "learning_rate": 3.344538239899754e-05, + "loss": 0.6148, + "step": 36060 + }, + { + "epoch": 1.4824554195085873, + "grad_norm": 1.2136929035186768, + "learning_rate": 3.3346119167772285e-05, + "loss": 0.6148, + "step": 36080 + }, + { + "epoch": 1.4832771797189581, + "grad_norm": 1.0873780250549316, + "learning_rate": 3.3246973975439274e-05, + "loss": 0.6213, + "step": 36100 + }, + { + "epoch": 1.4840989399293285, + "grad_norm": 1.09003746509552, + "learning_rate": 3.314794699757713e-05, + "loss": 0.6244, + "step": 36120 + }, + { + "epoch": 1.4849207001396993, + "grad_norm": 1.0032758712768555, + "learning_rate": 3.3049038409554855e-05, + "loss": 0.6265, + "step": 36140 + }, + { + "epoch": 1.4857424603500697, + "grad_norm": 1.10032057762146, + "learning_rate": 3.295024838653205e-05, + "loss": 0.6375, + "step": 36160 + }, + { + "epoch": 1.4865642205604406, + "grad_norm": 1.0320249795913696, + "learning_rate": 3.2851577103458196e-05, + "loss": 0.6101, + "step": 36180 + }, + { + "epoch": 1.487385980770811, + "grad_norm": 1.2705104351043701, + "learning_rate": 3.2753024735072534e-05, + "loss": 0.6337, + "step": 36200 + }, + { + "epoch": 1.4882077409811818, + "grad_norm": 1.1513526439666748, + "learning_rate": 3.2654591455903774e-05, + "loss": 0.6179, + "step": 36220 + }, + { + "epoch": 1.4890295011915522, + "grad_norm": 1.231684923171997, + "learning_rate": 3.2556277440269636e-05, + "loss": 0.619, + "step": 36240 + }, + { + "epoch": 1.489851261401923, + "grad_norm": 1.1558480262756348, + "learning_rate": 3.2458082862276685e-05, + "loss": 0.609, + "step": 36260 + }, + { + "epoch": 1.4906730216122934, + "grad_norm": 1.005265235900879, + "learning_rate": 3.236000789581992e-05, + "loss": 0.5977, + "step": 36280 + }, + { + "epoch": 1.4914947818226643, + "grad_norm": 1.1663181781768799, + "learning_rate": 3.2262052714582635e-05, + "loss": 0.6104, + "step": 36300 + }, + { + "epoch": 1.4923165420330347, + "grad_norm": 1.2102375030517578, + "learning_rate": 3.216421749203586e-05, + "loss": 0.6201, + "step": 36320 + }, + { + "epoch": 1.4931383022434055, + "grad_norm": 1.1391383409500122, + "learning_rate": 3.206650240143827e-05, + "loss": 0.6063, + "step": 36340 + }, + { + "epoch": 1.4939600624537759, + "grad_norm": 1.1457056999206543, + "learning_rate": 3.1968907615835756e-05, + "loss": 0.6111, + "step": 36360 + }, + { + "epoch": 1.4947818226641467, + "grad_norm": 1.105281114578247, + "learning_rate": 3.187143330806114e-05, + "loss": 0.6129, + "step": 36380 + }, + { + "epoch": 1.4956035828745171, + "grad_norm": 1.1415950059890747, + "learning_rate": 3.177407965073398e-05, + "loss": 0.6049, + "step": 36400 + }, + { + "epoch": 1.496425343084888, + "grad_norm": 1.0731744766235352, + "learning_rate": 3.1676846816260044e-05, + "loss": 0.6257, + "step": 36420 + }, + { + "epoch": 1.4972471032952583, + "grad_norm": 1.0916352272033691, + "learning_rate": 3.1579734976831265e-05, + "loss": 0.6219, + "step": 36440 + }, + { + "epoch": 1.4980688635056292, + "grad_norm": 1.1694599390029907, + "learning_rate": 3.14827443044252e-05, + "loss": 0.6068, + "step": 36460 + }, + { + "epoch": 1.4988906237159996, + "grad_norm": 1.0736804008483887, + "learning_rate": 3.1385874970804874e-05, + "loss": 0.6128, + "step": 36480 + }, + { + "epoch": 1.4997123839263704, + "grad_norm": 1.2031406164169312, + "learning_rate": 3.12891271475184e-05, + "loss": 0.6196, + "step": 36500 + }, + { + "epoch": 1.5005341441367408, + "grad_norm": 1.1104577779769897, + "learning_rate": 3.119250100589872e-05, + "loss": 0.6187, + "step": 36520 + }, + { + "epoch": 1.5013559043471116, + "grad_norm": 1.2821518182754517, + "learning_rate": 3.109599671706335e-05, + "loss": 0.6149, + "step": 36540 + }, + { + "epoch": 1.502177664557482, + "grad_norm": 1.0993192195892334, + "learning_rate": 3.09996144519139e-05, + "loss": 0.5835, + "step": 36560 + }, + { + "epoch": 1.5029994247678529, + "grad_norm": 1.0320968627929688, + "learning_rate": 3.090335438113597e-05, + "loss": 0.6023, + "step": 36580 + }, + { + "epoch": 1.5038211849782233, + "grad_norm": 0.9988365769386292, + "learning_rate": 3.08072166751987e-05, + "loss": 0.6205, + "step": 36600 + }, + { + "epoch": 1.504642945188594, + "grad_norm": 1.0576531887054443, + "learning_rate": 3.071120150435462e-05, + "loss": 0.6087, + "step": 36620 + }, + { + "epoch": 1.5054647053989645, + "grad_norm": 1.0885626077651978, + "learning_rate": 3.0615309038639186e-05, + "loss": 0.6034, + "step": 36640 + }, + { + "epoch": 1.5062864656093353, + "grad_norm": 1.0492252111434937, + "learning_rate": 3.051953944787054e-05, + "loss": 0.5865, + "step": 36660 + }, + { + "epoch": 1.5071082258197057, + "grad_norm": 1.0457483530044556, + "learning_rate": 3.0423892901649344e-05, + "loss": 0.6097, + "step": 36680 + }, + { + "epoch": 1.5079299860300766, + "grad_norm": 1.0300101041793823, + "learning_rate": 3.032836956935814e-05, + "loss": 0.5837, + "step": 36700 + }, + { + "epoch": 1.508751746240447, + "grad_norm": 1.1736280918121338, + "learning_rate": 3.023296962016151e-05, + "loss": 0.5926, + "step": 36720 + }, + { + "epoch": 1.5095735064508178, + "grad_norm": 1.0206154584884644, + "learning_rate": 3.0137693223005335e-05, + "loss": 0.6162, + "step": 36740 + }, + { + "epoch": 1.5103952666611882, + "grad_norm": 1.1228256225585938, + "learning_rate": 3.004254054661686e-05, + "loss": 0.5999, + "step": 36760 + }, + { + "epoch": 1.511217026871559, + "grad_norm": 1.0171713829040527, + "learning_rate": 2.994751175950411e-05, + "loss": 0.6092, + "step": 36780 + }, + { + "epoch": 1.5120387870819294, + "grad_norm": 1.017462968826294, + "learning_rate": 2.985260702995575e-05, + "loss": 0.6201, + "step": 36800 + }, + { + "epoch": 1.5128605472923002, + "grad_norm": 1.2459691762924194, + "learning_rate": 2.9757826526040755e-05, + "loss": 0.6189, + "step": 36820 + }, + { + "epoch": 1.5136823075026706, + "grad_norm": 1.027414321899414, + "learning_rate": 2.9663170415608078e-05, + "loss": 0.6104, + "step": 36840 + }, + { + "epoch": 1.5145040677130415, + "grad_norm": 1.0714311599731445, + "learning_rate": 2.9568638866286458e-05, + "loss": 0.6201, + "step": 36860 + }, + { + "epoch": 1.5153258279234119, + "grad_norm": 1.1437265872955322, + "learning_rate": 2.9474232045483952e-05, + "loss": 0.6102, + "step": 36880 + }, + { + "epoch": 1.5161475881337827, + "grad_norm": 1.2031759023666382, + "learning_rate": 2.93799501203878e-05, + "loss": 0.5923, + "step": 36900 + }, + { + "epoch": 1.516969348344153, + "grad_norm": 0.992753267288208, + "learning_rate": 2.928579325796401e-05, + "loss": 0.6025, + "step": 36920 + }, + { + "epoch": 1.517791108554524, + "grad_norm": 1.1709444522857666, + "learning_rate": 2.9191761624957115e-05, + "loss": 0.6125, + "step": 36940 + }, + { + "epoch": 1.5186128687648943, + "grad_norm": 1.1847578287124634, + "learning_rate": 2.909785538788995e-05, + "loss": 0.6141, + "step": 36960 + }, + { + "epoch": 1.5194346289752652, + "grad_norm": 1.032343864440918, + "learning_rate": 2.900407471306319e-05, + "loss": 0.6172, + "step": 36980 + }, + { + "epoch": 1.5202563891856355, + "grad_norm": 1.2214970588684082, + "learning_rate": 2.8910419766555275e-05, + "loss": 0.618, + "step": 37000 + }, + { + "epoch": 1.5210781493960064, + "grad_norm": 1.1395217180252075, + "learning_rate": 2.881689071422179e-05, + "loss": 0.6024, + "step": 37020 + }, + { + "epoch": 1.5218999096063768, + "grad_norm": 1.1765543222427368, + "learning_rate": 2.8723487721695562e-05, + "loss": 0.6021, + "step": 37040 + }, + { + "epoch": 1.5227216698167476, + "grad_norm": 1.1648917198181152, + "learning_rate": 2.8630210954386082e-05, + "loss": 0.6181, + "step": 37060 + }, + { + "epoch": 1.523543430027118, + "grad_norm": 1.1894433498382568, + "learning_rate": 2.853706057747929e-05, + "loss": 0.6272, + "step": 37080 + }, + { + "epoch": 1.5243651902374888, + "grad_norm": 1.1555765867233276, + "learning_rate": 2.84440367559374e-05, + "loss": 0.6187, + "step": 37100 + }, + { + "epoch": 1.5251869504478592, + "grad_norm": 1.1109102964401245, + "learning_rate": 2.8351139654498405e-05, + "loss": 0.611, + "step": 37120 + }, + { + "epoch": 1.52600871065823, + "grad_norm": 1.1215410232543945, + "learning_rate": 2.8258369437675926e-05, + "loss": 0.5798, + "step": 37140 + }, + { + "epoch": 1.5268304708686005, + "grad_norm": 1.1160521507263184, + "learning_rate": 2.816572626975884e-05, + "loss": 0.6054, + "step": 37160 + }, + { + "epoch": 1.5276522310789713, + "grad_norm": 1.0489914417266846, + "learning_rate": 2.8073210314811126e-05, + "loss": 0.6054, + "step": 37180 + }, + { + "epoch": 1.5284739912893417, + "grad_norm": 1.0901767015457153, + "learning_rate": 2.798082173667139e-05, + "loss": 0.6074, + "step": 37200 + }, + { + "epoch": 1.5292957514997125, + "grad_norm": 1.1401610374450684, + "learning_rate": 2.7888560698952702e-05, + "loss": 0.6087, + "step": 37220 + }, + { + "epoch": 1.530117511710083, + "grad_norm": 1.136517882347107, + "learning_rate": 2.7796427365042243e-05, + "loss": 0.6071, + "step": 37240 + }, + { + "epoch": 1.5309392719204538, + "grad_norm": 1.1060153245925903, + "learning_rate": 2.770442189810103e-05, + "loss": 0.6142, + "step": 37260 + }, + { + "epoch": 1.5317610321308242, + "grad_norm": 1.029160737991333, + "learning_rate": 2.7612544461063727e-05, + "loss": 0.6187, + "step": 37280 + }, + { + "epoch": 1.532582792341195, + "grad_norm": 1.1750712394714355, + "learning_rate": 2.752079521663814e-05, + "loss": 0.6107, + "step": 37300 + }, + { + "epoch": 1.5334045525515654, + "grad_norm": 1.0686962604522705, + "learning_rate": 2.7429174327305186e-05, + "loss": 0.6217, + "step": 37320 + }, + { + "epoch": 1.5342263127619362, + "grad_norm": 1.0369669198989868, + "learning_rate": 2.7337681955318363e-05, + "loss": 0.6108, + "step": 37340 + }, + { + "epoch": 1.5350480729723066, + "grad_norm": 1.0347490310668945, + "learning_rate": 2.724631826270362e-05, + "loss": 0.6055, + "step": 37360 + }, + { + "epoch": 1.5358698331826774, + "grad_norm": 1.0429108142852783, + "learning_rate": 2.715508341125904e-05, + "loss": 0.6004, + "step": 37380 + }, + { + "epoch": 1.5366915933930478, + "grad_norm": 1.1569420099258423, + "learning_rate": 2.7063977562554476e-05, + "loss": 0.613, + "step": 37400 + }, + { + "epoch": 1.5375133536034187, + "grad_norm": 1.1140472888946533, + "learning_rate": 2.6973000877931443e-05, + "loss": 0.6267, + "step": 37420 + }, + { + "epoch": 1.538335113813789, + "grad_norm": 1.0750665664672852, + "learning_rate": 2.6882153518502616e-05, + "loss": 0.6119, + "step": 37440 + }, + { + "epoch": 1.53915687402416, + "grad_norm": 1.0947927236557007, + "learning_rate": 2.6791435645151675e-05, + "loss": 0.5913, + "step": 37460 + }, + { + "epoch": 1.5399786342345303, + "grad_norm": 1.1905947923660278, + "learning_rate": 2.670084741853296e-05, + "loss": 0.6153, + "step": 37480 + }, + { + "epoch": 1.5408003944449011, + "grad_norm": 1.0356147289276123, + "learning_rate": 2.661038899907129e-05, + "loss": 0.6068, + "step": 37500 + }, + { + "epoch": 1.5416221546552715, + "grad_norm": 1.0834916830062866, + "learning_rate": 2.6520060546961566e-05, + "loss": 0.6017, + "step": 37520 + }, + { + "epoch": 1.5424439148656424, + "grad_norm": 1.0963350534439087, + "learning_rate": 2.6429862222168467e-05, + "loss": 0.6211, + "step": 37540 + }, + { + "epoch": 1.5432656750760128, + "grad_norm": 1.0668399333953857, + "learning_rate": 2.6339794184426393e-05, + "loss": 0.6182, + "step": 37560 + }, + { + "epoch": 1.5440874352863836, + "grad_norm": 1.169129490852356, + "learning_rate": 2.6249856593238763e-05, + "loss": 0.6078, + "step": 37580 + }, + { + "epoch": 1.544909195496754, + "grad_norm": 1.1952544450759888, + "learning_rate": 2.6160049607878234e-05, + "loss": 0.6056, + "step": 37600 + }, + { + "epoch": 1.5457309557071248, + "grad_norm": 1.1417872905731201, + "learning_rate": 2.6070373387386005e-05, + "loss": 0.6069, + "step": 37620 + }, + { + "epoch": 1.5465527159174952, + "grad_norm": 1.152288556098938, + "learning_rate": 2.5980828090571817e-05, + "loss": 0.6084, + "step": 37640 + }, + { + "epoch": 1.547374476127866, + "grad_norm": 1.1022766828536987, + "learning_rate": 2.589141387601346e-05, + "loss": 0.6145, + "step": 37660 + }, + { + "epoch": 1.5481962363382364, + "grad_norm": 1.0760823488235474, + "learning_rate": 2.580213090205663e-05, + "loss": 0.5979, + "step": 37680 + }, + { + "epoch": 1.5490179965486073, + "grad_norm": 1.1265369653701782, + "learning_rate": 2.5712979326814613e-05, + "loss": 0.6069, + "step": 37700 + }, + { + "epoch": 1.5498397567589777, + "grad_norm": 1.0889846086502075, + "learning_rate": 2.5623959308167945e-05, + "loss": 0.6006, + "step": 37720 + }, + { + "epoch": 1.5506615169693485, + "grad_norm": 1.110885739326477, + "learning_rate": 2.553507100376428e-05, + "loss": 0.6128, + "step": 37740 + }, + { + "epoch": 1.551483277179719, + "grad_norm": 0.9926326870918274, + "learning_rate": 2.5446314571017936e-05, + "loss": 0.6145, + "step": 37760 + }, + { + "epoch": 1.5523050373900897, + "grad_norm": 0.9480810165405273, + "learning_rate": 2.535769016710975e-05, + "loss": 0.6213, + "step": 37780 + }, + { + "epoch": 1.5531267976004601, + "grad_norm": 1.1244728565216064, + "learning_rate": 2.5269197948986678e-05, + "loss": 0.5935, + "step": 37800 + }, + { + "epoch": 1.553948557810831, + "grad_norm": 1.1508769989013672, + "learning_rate": 2.5180838073361624e-05, + "loss": 0.6349, + "step": 37820 + }, + { + "epoch": 1.5547703180212014, + "grad_norm": 1.137568473815918, + "learning_rate": 2.509261069671318e-05, + "loss": 0.6057, + "step": 37840 + }, + { + "epoch": 1.555592078231572, + "grad_norm": 1.1233346462249756, + "learning_rate": 2.5004515975285183e-05, + "loss": 0.5844, + "step": 37860 + }, + { + "epoch": 1.5564138384419426, + "grad_norm": 1.188909649848938, + "learning_rate": 2.491655406508667e-05, + "loss": 0.6043, + "step": 37880 + }, + { + "epoch": 1.5572355986523132, + "grad_norm": 1.0557928085327148, + "learning_rate": 2.4828725121891328e-05, + "loss": 0.5953, + "step": 37900 + }, + { + "epoch": 1.5580573588626838, + "grad_norm": 1.2361866235733032, + "learning_rate": 2.4745410928211422e-05, + "loss": 0.6316, + "step": 37920 + }, + { + "epoch": 1.5588791190730544, + "grad_norm": 1.1780842542648315, + "learning_rate": 2.4662217010784527e-05, + "loss": 0.614, + "step": 37940 + }, + { + "epoch": 1.559700879283425, + "grad_norm": 1.0414899587631226, + "learning_rate": 2.4574774550623027e-05, + "loss": 0.5905, + "step": 37960 + }, + { + "epoch": 1.5605226394937957, + "grad_norm": 1.1471487283706665, + "learning_rate": 2.448746566272997e-05, + "loss": 0.5866, + "step": 37980 + }, + { + "epoch": 1.5613443997041663, + "grad_norm": 1.048036813735962, + "learning_rate": 2.4400290501722623e-05, + "loss": 0.6068, + "step": 38000 + }, + { + "epoch": 1.5613443997041663, + "eval_loss": 0.8905351758003235, + "eval_runtime": 16.5754, + "eval_samples_per_second": 158.066, + "eval_steps_per_second": 4.947, + "step": 38000 + }, + { + "epoch": 1.562166159914537, + "grad_norm": 1.0737829208374023, + "learning_rate": 2.431324922198156e-05, + "loss": 0.6073, + "step": 38020 + }, + { + "epoch": 1.5629879201249075, + "grad_norm": 1.0627774000167847, + "learning_rate": 2.4226341977650145e-05, + "loss": 0.6057, + "step": 38040 + }, + { + "epoch": 1.5638096803352781, + "grad_norm": 1.09597647190094, + "learning_rate": 2.4139568922634427e-05, + "loss": 0.6003, + "step": 38060 + }, + { + "epoch": 1.5646314405456487, + "grad_norm": 1.0973150730133057, + "learning_rate": 2.40529302106028e-05, + "loss": 0.6181, + "step": 38080 + }, + { + "epoch": 1.5654532007560193, + "grad_norm": 1.163824200630188, + "learning_rate": 2.396642599498573e-05, + "loss": 0.5985, + "step": 38100 + }, + { + "epoch": 1.56627496096639, + "grad_norm": 1.2081292867660522, + "learning_rate": 2.3880056428975572e-05, + "loss": 0.5934, + "step": 38120 + }, + { + "epoch": 1.5670967211767606, + "grad_norm": 1.1347095966339111, + "learning_rate": 2.379382166552614e-05, + "loss": 0.6004, + "step": 38140 + }, + { + "epoch": 1.5679184813871312, + "grad_norm": 1.0734606981277466, + "learning_rate": 2.3707721857352628e-05, + "loss": 0.6167, + "step": 38160 + }, + { + "epoch": 1.5687402415975018, + "grad_norm": 1.0315816402435303, + "learning_rate": 2.362175715693106e-05, + "loss": 0.5854, + "step": 38180 + }, + { + "epoch": 1.5695620018078724, + "grad_norm": 1.1294316053390503, + "learning_rate": 2.3535927716498397e-05, + "loss": 0.5945, + "step": 38200 + }, + { + "epoch": 1.570383762018243, + "grad_norm": 1.1472307443618774, + "learning_rate": 2.3450233688051936e-05, + "loss": 0.5976, + "step": 38220 + }, + { + "epoch": 1.5712055222286136, + "grad_norm": 1.253547191619873, + "learning_rate": 2.3364675223349186e-05, + "loss": 0.6084, + "step": 38240 + }, + { + "epoch": 1.5720272824389843, + "grad_norm": 1.0203603506088257, + "learning_rate": 2.3279252473907674e-05, + "loss": 0.5904, + "step": 38260 + }, + { + "epoch": 1.5728490426493549, + "grad_norm": 1.0900869369506836, + "learning_rate": 2.3193965591004408e-05, + "loss": 0.6092, + "step": 38280 + }, + { + "epoch": 1.5736708028597255, + "grad_norm": 1.003138542175293, + "learning_rate": 2.3108814725675975e-05, + "loss": 0.5982, + "step": 38300 + }, + { + "epoch": 1.574492563070096, + "grad_norm": 1.138856291770935, + "learning_rate": 2.3023800028717956e-05, + "loss": 0.6178, + "step": 38320 + }, + { + "epoch": 1.5753143232804667, + "grad_norm": 1.1773872375488281, + "learning_rate": 2.2943162329690658e-05, + "loss": 0.6051, + "step": 38340 + }, + { + "epoch": 1.5761360834908373, + "grad_norm": 1.2317793369293213, + "learning_rate": 2.2858413593867434e-05, + "loss": 0.6073, + "step": 38360 + }, + { + "epoch": 1.576957843701208, + "grad_norm": 1.0834593772888184, + "learning_rate": 2.2773801469855805e-05, + "loss": 0.6154, + "step": 38380 + }, + { + "epoch": 1.5777796039115786, + "grad_norm": 1.1221275329589844, + "learning_rate": 2.2689326107497267e-05, + "loss": 0.5776, + "step": 38400 + }, + { + "epoch": 1.5786013641219492, + "grad_norm": 1.0579743385314941, + "learning_rate": 2.260498765639125e-05, + "loss": 0.5986, + "step": 38420 + }, + { + "epoch": 1.5794231243323198, + "grad_norm": 1.0688302516937256, + "learning_rate": 2.252078626589462e-05, + "loss": 0.5839, + "step": 38440 + }, + { + "epoch": 1.5802448845426904, + "grad_norm": 1.1622627973556519, + "learning_rate": 2.2436722085121565e-05, + "loss": 0.6065, + "step": 38460 + }, + { + "epoch": 1.581066644753061, + "grad_norm": 1.1227208375930786, + "learning_rate": 2.2352795262943272e-05, + "loss": 0.6048, + "step": 38480 + }, + { + "epoch": 1.5818884049634316, + "grad_norm": 0.9703273177146912, + "learning_rate": 2.2269005947987664e-05, + "loss": 0.6037, + "step": 38500 + }, + { + "epoch": 1.5827101651738023, + "grad_norm": 1.0502504110336304, + "learning_rate": 2.2185354288639216e-05, + "loss": 0.6096, + "step": 38520 + }, + { + "epoch": 1.5835319253841729, + "grad_norm": 1.1714296340942383, + "learning_rate": 2.210184043303852e-05, + "loss": 0.612, + "step": 38540 + }, + { + "epoch": 1.5843536855945435, + "grad_norm": 1.051988124847412, + "learning_rate": 2.2018464529082282e-05, + "loss": 0.6334, + "step": 38560 + }, + { + "epoch": 1.585175445804914, + "grad_norm": 1.1384596824645996, + "learning_rate": 2.1935226724422686e-05, + "loss": 0.6027, + "step": 38580 + }, + { + "epoch": 1.5859972060152847, + "grad_norm": 1.1457267999649048, + "learning_rate": 2.1852127166467572e-05, + "loss": 0.5929, + "step": 38600 + }, + { + "epoch": 1.5868189662256553, + "grad_norm": 1.1787493228912354, + "learning_rate": 2.1769166002379826e-05, + "loss": 0.5897, + "step": 38620 + }, + { + "epoch": 1.587640726436026, + "grad_norm": 1.1157082319259644, + "learning_rate": 2.1686343379077246e-05, + "loss": 0.5987, + "step": 38640 + }, + { + "epoch": 1.5884624866463966, + "grad_norm": 1.0821255445480347, + "learning_rate": 2.1603659443232394e-05, + "loss": 0.6027, + "step": 38660 + }, + { + "epoch": 1.5892842468567672, + "grad_norm": 1.0201191902160645, + "learning_rate": 2.152111434127212e-05, + "loss": 0.6001, + "step": 38680 + }, + { + "epoch": 1.5901060070671378, + "grad_norm": 1.0591297149658203, + "learning_rate": 2.1438708219377444e-05, + "loss": 0.6132, + "step": 38700 + }, + { + "epoch": 1.5909277672775084, + "grad_norm": 1.0407557487487793, + "learning_rate": 2.1356441223483246e-05, + "loss": 0.5904, + "step": 38720 + }, + { + "epoch": 1.591749527487879, + "grad_norm": 1.0210407972335815, + "learning_rate": 2.12743134992781e-05, + "loss": 0.5876, + "step": 38740 + }, + { + "epoch": 1.5925712876982496, + "grad_norm": 1.0307002067565918, + "learning_rate": 2.1192325192203843e-05, + "loss": 0.6178, + "step": 38760 + }, + { + "epoch": 1.5933930479086202, + "grad_norm": 0.9952294826507568, + "learning_rate": 2.1110476447455453e-05, + "loss": 0.59, + "step": 38780 + }, + { + "epoch": 1.5942148081189909, + "grad_norm": 1.145140528678894, + "learning_rate": 2.1028767409980776e-05, + "loss": 0.5751, + "step": 38800 + }, + { + "epoch": 1.5950365683293615, + "grad_norm": 1.0947434902191162, + "learning_rate": 2.094719822448019e-05, + "loss": 0.5838, + "step": 38820 + }, + { + "epoch": 1.595858328539732, + "grad_norm": 1.2791152000427246, + "learning_rate": 2.086576903540649e-05, + "loss": 0.6055, + "step": 38840 + }, + { + "epoch": 1.5966800887501027, + "grad_norm": 1.142967700958252, + "learning_rate": 2.0784479986964467e-05, + "loss": 0.5962, + "step": 38860 + }, + { + "epoch": 1.5975018489604733, + "grad_norm": 1.1335628032684326, + "learning_rate": 2.070333122311081e-05, + "loss": 0.6056, + "step": 38880 + }, + { + "epoch": 1.598323609170844, + "grad_norm": 1.2280479669570923, + "learning_rate": 2.0622322887553703e-05, + "loss": 0.6012, + "step": 38900 + }, + { + "epoch": 1.5991453693812145, + "grad_norm": 1.0319156646728516, + "learning_rate": 2.0541455123752686e-05, + "loss": 0.5985, + "step": 38920 + }, + { + "epoch": 1.5999671295915852, + "grad_norm": 0.9951415061950684, + "learning_rate": 2.046072807491832e-05, + "loss": 0.594, + "step": 38940 + }, + { + "epoch": 1.6007888898019558, + "grad_norm": 1.0593833923339844, + "learning_rate": 2.0380141884012004e-05, + "loss": 0.5987, + "step": 38960 + }, + { + "epoch": 1.6016106500123264, + "grad_norm": 1.0992417335510254, + "learning_rate": 2.0299696693745697e-05, + "loss": 0.5815, + "step": 38980 + }, + { + "epoch": 1.602432410222697, + "grad_norm": 0.9815024137496948, + "learning_rate": 2.0219392646581638e-05, + "loss": 0.5727, + "step": 39000 + }, + { + "epoch": 1.6032541704330676, + "grad_norm": 1.1512722969055176, + "learning_rate": 2.013922988473209e-05, + "loss": 0.6208, + "step": 39020 + }, + { + "epoch": 1.6040759306434382, + "grad_norm": 1.1306536197662354, + "learning_rate": 2.0059208550159125e-05, + "loss": 0.6162, + "step": 39040 + }, + { + "epoch": 1.6048976908538088, + "grad_norm": 1.1175142526626587, + "learning_rate": 1.9979328784574415e-05, + "loss": 0.5983, + "step": 39060 + }, + { + "epoch": 1.6057194510641795, + "grad_norm": 1.3080164194107056, + "learning_rate": 1.9899590729438856e-05, + "loss": 0.6112, + "step": 39080 + }, + { + "epoch": 1.60654121127455, + "grad_norm": 1.130448579788208, + "learning_rate": 1.981999452596236e-05, + "loss": 0.5753, + "step": 39100 + }, + { + "epoch": 1.6073629714849207, + "grad_norm": 1.0560057163238525, + "learning_rate": 1.9740540315103772e-05, + "loss": 0.593, + "step": 39120 + }, + { + "epoch": 1.6081847316952913, + "grad_norm": 1.0870985984802246, + "learning_rate": 1.9661228237570272e-05, + "loss": 0.606, + "step": 39140 + }, + { + "epoch": 1.609006491905662, + "grad_norm": 1.1902962923049927, + "learning_rate": 1.9582058433817528e-05, + "loss": 0.6184, + "step": 39160 + }, + { + "epoch": 1.6098282521160325, + "grad_norm": 1.0945310592651367, + "learning_rate": 1.9503031044049136e-05, + "loss": 0.5869, + "step": 39180 + }, + { + "epoch": 1.6106500123264031, + "grad_norm": 1.018189549446106, + "learning_rate": 1.942414620821651e-05, + "loss": 0.605, + "step": 39200 + }, + { + "epoch": 1.6114717725367738, + "grad_norm": 1.0882275104522705, + "learning_rate": 1.934540406601867e-05, + "loss": 0.604, + "step": 39220 + }, + { + "epoch": 1.6122935327471444, + "grad_norm": 1.091739296913147, + "learning_rate": 1.9266804756901812e-05, + "loss": 0.588, + "step": 39240 + }, + { + "epoch": 1.613115292957515, + "grad_norm": 1.0217018127441406, + "learning_rate": 1.918834842005933e-05, + "loss": 0.5712, + "step": 39260 + }, + { + "epoch": 1.6139370531678856, + "grad_norm": 1.0229742527008057, + "learning_rate": 1.9110035194431298e-05, + "loss": 0.5956, + "step": 39280 + }, + { + "epoch": 1.6147588133782562, + "grad_norm": 1.082470417022705, + "learning_rate": 1.903186521870448e-05, + "loss": 0.5962, + "step": 39300 + }, + { + "epoch": 1.6155805735886268, + "grad_norm": 1.0316046476364136, + "learning_rate": 1.895383863131185e-05, + "loss": 0.6197, + "step": 39320 + }, + { + "epoch": 1.6164023337989974, + "grad_norm": 1.150221347808838, + "learning_rate": 1.887595557043248e-05, + "loss": 0.6, + "step": 39340 + }, + { + "epoch": 1.617224094009368, + "grad_norm": 1.0211883783340454, + "learning_rate": 1.879821617399129e-05, + "loss": 0.5957, + "step": 39360 + }, + { + "epoch": 1.6180458542197387, + "grad_norm": 1.1843931674957275, + "learning_rate": 1.8720620579658733e-05, + "loss": 0.5956, + "step": 39380 + }, + { + "epoch": 1.6188676144301093, + "grad_norm": 1.0743961334228516, + "learning_rate": 1.8643168924850695e-05, + "loss": 0.6038, + "step": 39400 + }, + { + "epoch": 1.61968937464048, + "grad_norm": 0.9900259971618652, + "learning_rate": 1.8565861346728032e-05, + "loss": 0.5963, + "step": 39420 + }, + { + "epoch": 1.6205111348508505, + "grad_norm": 1.045684814453125, + "learning_rate": 1.848869798219659e-05, + "loss": 0.6107, + "step": 39440 + }, + { + "epoch": 1.6213328950612211, + "grad_norm": 1.1444482803344727, + "learning_rate": 1.8411678967906655e-05, + "loss": 0.6047, + "step": 39460 + }, + { + "epoch": 1.6221546552715917, + "grad_norm": 1.0717675685882568, + "learning_rate": 1.833480444025304e-05, + "loss": 0.6126, + "step": 39480 + }, + { + "epoch": 1.6229764154819624, + "grad_norm": 1.04216468334198, + "learning_rate": 1.8258074535374604e-05, + "loss": 0.584, + "step": 39500 + }, + { + "epoch": 1.623798175692333, + "grad_norm": 1.0672125816345215, + "learning_rate": 1.818148938915406e-05, + "loss": 0.5991, + "step": 39520 + }, + { + "epoch": 1.6246199359027036, + "grad_norm": 1.1428139209747314, + "learning_rate": 1.81050491372179e-05, + "loss": 0.5991, + "step": 39540 + }, + { + "epoch": 1.6254416961130742, + "grad_norm": 1.1530945301055908, + "learning_rate": 1.80287539149358e-05, + "loss": 0.6207, + "step": 39560 + }, + { + "epoch": 1.6262634563234448, + "grad_norm": 1.0400162935256958, + "learning_rate": 1.7952603857420837e-05, + "loss": 0.6112, + "step": 39580 + }, + { + "epoch": 1.6270852165338154, + "grad_norm": 1.172669768333435, + "learning_rate": 1.7876599099528822e-05, + "loss": 0.5991, + "step": 39600 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.9719765782356262, + "learning_rate": 1.78007397758584e-05, + "loss": 0.6059, + "step": 39620 + }, + { + "epoch": 1.6287287369545567, + "grad_norm": 1.0943204164505005, + "learning_rate": 1.7725026020750547e-05, + "loss": 0.5868, + "step": 39640 + }, + { + "epoch": 1.6295504971649273, + "grad_norm": 1.281966209411621, + "learning_rate": 1.764945796828852e-05, + "loss": 0.6005, + "step": 39660 + }, + { + "epoch": 1.630372257375298, + "grad_norm": 1.1267539262771606, + "learning_rate": 1.7574035752297503e-05, + "loss": 0.5725, + "step": 39680 + }, + { + "epoch": 1.6311940175856685, + "grad_norm": 1.1680996417999268, + "learning_rate": 1.749875950634442e-05, + "loss": 0.5967, + "step": 39700 + }, + { + "epoch": 1.6320157777960391, + "grad_norm": 1.0391075611114502, + "learning_rate": 1.742362936373776e-05, + "loss": 0.5903, + "step": 39720 + }, + { + "epoch": 1.6328375380064097, + "grad_norm": 1.166467547416687, + "learning_rate": 1.734864545752716e-05, + "loss": 0.6019, + "step": 39740 + }, + { + "epoch": 1.6336592982167804, + "grad_norm": 1.027796983718872, + "learning_rate": 1.7273807920503436e-05, + "loss": 0.5935, + "step": 39760 + }, + { + "epoch": 1.634481058427151, + "grad_norm": 1.0871942043304443, + "learning_rate": 1.7199116885197995e-05, + "loss": 0.6119, + "step": 39780 + }, + { + "epoch": 1.6353028186375216, + "grad_norm": 1.0691869258880615, + "learning_rate": 1.7124572483882996e-05, + "loss": 0.6061, + "step": 39800 + }, + { + "epoch": 1.6361245788478922, + "grad_norm": 1.0936367511749268, + "learning_rate": 1.70501748485708e-05, + "loss": 0.5716, + "step": 39820 + }, + { + "epoch": 1.6369463390582628, + "grad_norm": 1.1092150211334229, + "learning_rate": 1.6975924111013873e-05, + "loss": 0.5975, + "step": 39840 + }, + { + "epoch": 1.6377680992686334, + "grad_norm": 0.9624285697937012, + "learning_rate": 1.6901820402704606e-05, + "loss": 0.6096, + "step": 39860 + }, + { + "epoch": 1.638589859479004, + "grad_norm": 1.1558884382247925, + "learning_rate": 1.6827863854874938e-05, + "loss": 0.6157, + "step": 39880 + }, + { + "epoch": 1.6394116196893747, + "grad_norm": 1.3336347341537476, + "learning_rate": 1.6754054598496215e-05, + "loss": 0.601, + "step": 39900 + }, + { + "epoch": 1.6402333798997453, + "grad_norm": 0.9931703209877014, + "learning_rate": 1.668039276427894e-05, + "loss": 0.5828, + "step": 39920 + }, + { + "epoch": 1.6410551401101159, + "grad_norm": 1.170390248298645, + "learning_rate": 1.6606878482672582e-05, + "loss": 0.5879, + "step": 39940 + }, + { + "epoch": 1.6418769003204865, + "grad_norm": 1.0788556337356567, + "learning_rate": 1.653351188386526e-05, + "loss": 0.603, + "step": 39960 + }, + { + "epoch": 1.6426986605308571, + "grad_norm": 1.1531950235366821, + "learning_rate": 1.6460293097783574e-05, + "loss": 0.6071, + "step": 39980 + }, + { + "epoch": 1.6435204207412277, + "grad_norm": 1.1545748710632324, + "learning_rate": 1.638722225409236e-05, + "loss": 0.5967, + "step": 40000 + }, + { + "epoch": 1.6435204207412277, + "eval_loss": 0.8823444247245789, + "eval_runtime": 16.6502, + "eval_samples_per_second": 157.355, + "eval_steps_per_second": 4.925, + "step": 40000 + }, + { + "epoch": 1.6443421809515983, + "grad_norm": 1.1601914167404175, + "learning_rate": 1.6314299482194418e-05, + "loss": 0.6032, + "step": 40020 + }, + { + "epoch": 1.645163941161969, + "grad_norm": 1.0907021760940552, + "learning_rate": 1.624152491123043e-05, + "loss": 0.6075, + "step": 40040 + }, + { + "epoch": 1.6459857013723396, + "grad_norm": 1.2451189756393433, + "learning_rate": 1.6168898670078537e-05, + "loss": 0.5927, + "step": 40060 + }, + { + "epoch": 1.6468074615827102, + "grad_norm": 1.2181775569915771, + "learning_rate": 1.609642088735418e-05, + "loss": 0.5866, + "step": 40080 + }, + { + "epoch": 1.6476292217930808, + "grad_norm": 1.0169905424118042, + "learning_rate": 1.6024091691410013e-05, + "loss": 0.5901, + "step": 40100 + }, + { + "epoch": 1.6484509820034514, + "grad_norm": 1.0728631019592285, + "learning_rate": 1.595191121033538e-05, + "loss": 0.5929, + "step": 40120 + }, + { + "epoch": 1.649272742213822, + "grad_norm": 1.216049313545227, + "learning_rate": 1.5879879571956436e-05, + "loss": 0.5836, + "step": 40140 + }, + { + "epoch": 1.6500945024241926, + "grad_norm": 1.093613862991333, + "learning_rate": 1.5807996903835608e-05, + "loss": 0.5816, + "step": 40160 + }, + { + "epoch": 1.6509162626345633, + "grad_norm": 1.1138787269592285, + "learning_rate": 1.5736263333271618e-05, + "loss": 0.5954, + "step": 40180 + }, + { + "epoch": 1.6517380228449339, + "grad_norm": 1.0472607612609863, + "learning_rate": 1.5664678987299085e-05, + "loss": 0.6054, + "step": 40200 + }, + { + "epoch": 1.6525597830553045, + "grad_norm": 1.0426794290542603, + "learning_rate": 1.5593243992688356e-05, + "loss": 0.5902, + "step": 40220 + }, + { + "epoch": 1.653381543265675, + "grad_norm": 1.1743807792663574, + "learning_rate": 1.552195847594533e-05, + "loss": 0.5822, + "step": 40240 + }, + { + "epoch": 1.6542033034760457, + "grad_norm": 1.111167550086975, + "learning_rate": 1.5450822563311128e-05, + "loss": 0.628, + "step": 40260 + }, + { + "epoch": 1.6550250636864163, + "grad_norm": 1.124616265296936, + "learning_rate": 1.5379836380762035e-05, + "loss": 0.5867, + "step": 40280 + }, + { + "epoch": 1.655846823896787, + "grad_norm": 1.0396867990493774, + "learning_rate": 1.530900005400906e-05, + "loss": 0.6029, + "step": 40300 + }, + { + "epoch": 1.6566685841071576, + "grad_norm": 1.176413893699646, + "learning_rate": 1.5238313708497964e-05, + "loss": 0.6044, + "step": 40320 + }, + { + "epoch": 1.6574903443175282, + "grad_norm": 1.1538265943527222, + "learning_rate": 1.5167777469408728e-05, + "loss": 0.591, + "step": 40340 + }, + { + "epoch": 1.6583121045278988, + "grad_norm": 1.1776963472366333, + "learning_rate": 1.5097391461655663e-05, + "loss": 0.5988, + "step": 40360 + }, + { + "epoch": 1.6591338647382694, + "grad_norm": 1.0726710557937622, + "learning_rate": 1.5027155809886962e-05, + "loss": 0.6071, + "step": 40380 + }, + { + "epoch": 1.65995562494864, + "grad_norm": 1.0493417978286743, + "learning_rate": 1.4957070638484515e-05, + "loss": 0.5879, + "step": 40400 + }, + { + "epoch": 1.6607773851590106, + "grad_norm": 1.1617422103881836, + "learning_rate": 1.4887136071563856e-05, + "loss": 0.5913, + "step": 40420 + }, + { + "epoch": 1.6615991453693812, + "grad_norm": 1.0822564363479614, + "learning_rate": 1.4817352232973626e-05, + "loss": 0.601, + "step": 40440 + }, + { + "epoch": 1.6624209055797519, + "grad_norm": 1.1444182395935059, + "learning_rate": 1.4747719246295676e-05, + "loss": 0.602, + "step": 40460 + }, + { + "epoch": 1.6632426657901225, + "grad_norm": 1.0876108407974243, + "learning_rate": 1.4678237234844649e-05, + "loss": 0.6045, + "step": 40480 + }, + { + "epoch": 1.664064426000493, + "grad_norm": 1.1832387447357178, + "learning_rate": 1.460890632166787e-05, + "loss": 0.5967, + "step": 40500 + }, + { + "epoch": 1.6648861862108637, + "grad_norm": 0.9234125018119812, + "learning_rate": 1.4543182020758783e-05, + "loss": 0.5895, + "step": 40520 + }, + { + "epoch": 1.6657079464212343, + "grad_norm": 1.1049689054489136, + "learning_rate": 1.4474146102117969e-05, + "loss": 0.6013, + "step": 40540 + }, + { + "epoch": 1.666529706631605, + "grad_norm": 1.0653325319290161, + "learning_rate": 1.4405261643181056e-05, + "loss": 0.6025, + "step": 40560 + }, + { + "epoch": 1.6673514668419755, + "grad_norm": 1.2282037734985352, + "learning_rate": 1.4336528765937151e-05, + "loss": 0.614, + "step": 40580 + }, + { + "epoch": 1.6681732270523462, + "grad_norm": 1.1362701654434204, + "learning_rate": 1.4267947592106845e-05, + "loss": 0.6083, + "step": 40600 + }, + { + "epoch": 1.6689949872627168, + "grad_norm": 1.0828361511230469, + "learning_rate": 1.4199518243142196e-05, + "loss": 0.5823, + "step": 40620 + }, + { + "epoch": 1.6698167474730874, + "grad_norm": 1.042107105255127, + "learning_rate": 1.4131240840226201e-05, + "loss": 0.5848, + "step": 40640 + }, + { + "epoch": 1.670638507683458, + "grad_norm": 1.00413978099823, + "learning_rate": 1.4063115504272973e-05, + "loss": 0.6082, + "step": 40660 + }, + { + "epoch": 1.6714602678938286, + "grad_norm": 1.0120972394943237, + "learning_rate": 1.3995142355927216e-05, + "loss": 0.582, + "step": 40680 + }, + { + "epoch": 1.6722820281041992, + "grad_norm": 1.2198662757873535, + "learning_rate": 1.3927321515564107e-05, + "loss": 0.5908, + "step": 40700 + }, + { + "epoch": 1.6731037883145699, + "grad_norm": 1.1000231504440308, + "learning_rate": 1.3859653103289205e-05, + "loss": 0.585, + "step": 40720 + }, + { + "epoch": 1.6739255485249405, + "grad_norm": 1.0395424365997314, + "learning_rate": 1.3792137238937975e-05, + "loss": 0.5767, + "step": 40740 + }, + { + "epoch": 1.674747308735311, + "grad_norm": 1.1138056516647339, + "learning_rate": 1.3724774042075882e-05, + "loss": 0.5843, + "step": 40760 + }, + { + "epoch": 1.6755690689456817, + "grad_norm": 1.0291800498962402, + "learning_rate": 1.3657563631997928e-05, + "loss": 0.583, + "step": 40780 + }, + { + "epoch": 1.6763908291560523, + "grad_norm": 1.1663140058517456, + "learning_rate": 1.3590506127728575e-05, + "loss": 0.5913, + "step": 40800 + }, + { + "epoch": 1.677212589366423, + "grad_norm": 1.0701870918273926, + "learning_rate": 1.3523601648021533e-05, + "loss": 0.5924, + "step": 40820 + }, + { + "epoch": 1.6780343495767935, + "grad_norm": 1.1024507284164429, + "learning_rate": 1.345685031135947e-05, + "loss": 0.5786, + "step": 40840 + }, + { + "epoch": 1.6788561097871642, + "grad_norm": 0.9765409231185913, + "learning_rate": 1.3393578497947468e-05, + "loss": 0.5825, + "step": 40860 + }, + { + "epoch": 1.6796778699975348, + "grad_norm": 1.2072242498397827, + "learning_rate": 1.332712612998166e-05, + "loss": 0.6016, + "step": 40880 + }, + { + "epoch": 1.6804996302079054, + "grad_norm": 1.1122468709945679, + "learning_rate": 1.326082725300385e-05, + "loss": 0.6234, + "step": 40900 + }, + { + "epoch": 1.681321390418276, + "grad_norm": 1.0995004177093506, + "learning_rate": 1.319468198442424e-05, + "loss": 0.5913, + "step": 40920 + }, + { + "epoch": 1.6821431506286466, + "grad_norm": 1.0409519672393799, + "learning_rate": 1.3128690441380998e-05, + "loss": 0.5948, + "step": 40940 + }, + { + "epoch": 1.6829649108390172, + "grad_norm": 1.1031830310821533, + "learning_rate": 1.3062852740740072e-05, + "loss": 0.5988, + "step": 40960 + }, + { + "epoch": 1.6837866710493878, + "grad_norm": 1.1113379001617432, + "learning_rate": 1.2997168999094978e-05, + "loss": 0.5822, + "step": 40980 + }, + { + "epoch": 1.6846084312597585, + "grad_norm": 1.0048646926879883, + "learning_rate": 1.2931639332766543e-05, + "loss": 0.5817, + "step": 41000 + }, + { + "epoch": 1.685430191470129, + "grad_norm": 1.159192681312561, + "learning_rate": 1.2866263857802818e-05, + "loss": 0.5832, + "step": 41020 + }, + { + "epoch": 1.6862519516804997, + "grad_norm": 1.1091160774230957, + "learning_rate": 1.280104268997865e-05, + "loss": 0.5858, + "step": 41040 + }, + { + "epoch": 1.6870737118908703, + "grad_norm": 1.1325902938842773, + "learning_rate": 1.2735975944795775e-05, + "loss": 0.5843, + "step": 41060 + }, + { + "epoch": 1.687895472101241, + "grad_norm": 1.0565228462219238, + "learning_rate": 1.267106373748237e-05, + "loss": 0.5852, + "step": 41080 + }, + { + "epoch": 1.6887172323116115, + "grad_norm": 1.1818876266479492, + "learning_rate": 1.2606306182992933e-05, + "loss": 0.588, + "step": 41100 + }, + { + "epoch": 1.6895389925219821, + "grad_norm": 0.8950326442718506, + "learning_rate": 1.2541703396008142e-05, + "loss": 0.5963, + "step": 41120 + }, + { + "epoch": 1.6903607527323525, + "grad_norm": 1.1267063617706299, + "learning_rate": 1.2477255490934559e-05, + "loss": 0.5758, + "step": 41140 + }, + { + "epoch": 1.6911825129427234, + "grad_norm": 0.9466457962989807, + "learning_rate": 1.241296258190444e-05, + "loss": 0.5963, + "step": 41160 + }, + { + "epoch": 1.6920042731530938, + "grad_norm": 1.0484158992767334, + "learning_rate": 1.2348824782775581e-05, + "loss": 0.586, + "step": 41180 + }, + { + "epoch": 1.6928260333634646, + "grad_norm": 1.1975510120391846, + "learning_rate": 1.2284842207131109e-05, + "loss": 0.5775, + "step": 41200 + }, + { + "epoch": 1.693647793573835, + "grad_norm": 1.1231242418289185, + "learning_rate": 1.2221014968279233e-05, + "loss": 0.5915, + "step": 41220 + }, + { + "epoch": 1.6944695537842058, + "grad_norm": 1.1098995208740234, + "learning_rate": 1.2157343179253079e-05, + "loss": 0.5886, + "step": 41240 + }, + { + "epoch": 1.6952913139945762, + "grad_norm": 1.053568720817566, + "learning_rate": 1.2093826952810471e-05, + "loss": 0.5961, + "step": 41260 + }, + { + "epoch": 1.696113074204947, + "grad_norm": 1.1454240083694458, + "learning_rate": 1.2030466401433748e-05, + "loss": 0.5888, + "step": 41280 + }, + { + "epoch": 1.6969348344153175, + "grad_norm": 1.0990582704544067, + "learning_rate": 1.1967261637329607e-05, + "loss": 0.5945, + "step": 41300 + }, + { + "epoch": 1.6977565946256883, + "grad_norm": 1.2519744634628296, + "learning_rate": 1.190421277242878e-05, + "loss": 0.5782, + "step": 41320 + }, + { + "epoch": 1.6985783548360587, + "grad_norm": 0.9864106178283691, + "learning_rate": 1.1841319918385996e-05, + "loss": 0.5856, + "step": 41340 + }, + { + "epoch": 1.6994001150464295, + "grad_norm": 1.0756564140319824, + "learning_rate": 1.1778583186579628e-05, + "loss": 0.5893, + "step": 41360 + }, + { + "epoch": 1.7002218752568, + "grad_norm": 1.163355827331543, + "learning_rate": 1.1716002688111616e-05, + "loss": 0.6051, + "step": 41380 + }, + { + "epoch": 1.7010436354671707, + "grad_norm": 1.259600281715393, + "learning_rate": 1.1653578533807186e-05, + "loss": 0.6031, + "step": 41400 + }, + { + "epoch": 1.7018653956775411, + "grad_norm": 1.1272526979446411, + "learning_rate": 1.1591310834214709e-05, + "loss": 0.584, + "step": 41420 + }, + { + "epoch": 1.702687155887912, + "grad_norm": 1.1468937397003174, + "learning_rate": 1.152919969960552e-05, + "loss": 0.6068, + "step": 41440 + }, + { + "epoch": 1.7035089160982824, + "grad_norm": 1.1517606973648071, + "learning_rate": 1.1467245239973633e-05, + "loss": 0.5757, + "step": 41460 + }, + { + "epoch": 1.7043306763086532, + "grad_norm": 1.0870920419692993, + "learning_rate": 1.1405447565035631e-05, + "loss": 0.6043, + "step": 41480 + }, + { + "epoch": 1.7051524365190236, + "grad_norm": 1.0770379304885864, + "learning_rate": 1.1343806784230426e-05, + "loss": 0.5905, + "step": 41500 + }, + { + "epoch": 1.7059741967293944, + "grad_norm": 1.0060986280441284, + "learning_rate": 1.128232300671912e-05, + "loss": 0.6047, + "step": 41520 + }, + { + "epoch": 1.7067959569397648, + "grad_norm": 1.0191991329193115, + "learning_rate": 1.1220996341384748e-05, + "loss": 0.5647, + "step": 41540 + }, + { + "epoch": 1.7076177171501357, + "grad_norm": 1.191707730293274, + "learning_rate": 1.1159826896832082e-05, + "loss": 0.5875, + "step": 41560 + }, + { + "epoch": 1.708439477360506, + "grad_norm": 1.2851048707962036, + "learning_rate": 1.1098814781387568e-05, + "loss": 0.5908, + "step": 41580 + }, + { + "epoch": 1.709261237570877, + "grad_norm": 1.121020793914795, + "learning_rate": 1.1037960103098877e-05, + "loss": 0.6084, + "step": 41600 + }, + { + "epoch": 1.7100829977812473, + "grad_norm": 1.1154911518096924, + "learning_rate": 1.0977262969735014e-05, + "loss": 0.5814, + "step": 41620 + }, + { + "epoch": 1.7109047579916181, + "grad_norm": 1.1276777982711792, + "learning_rate": 1.091672348878594e-05, + "loss": 0.5853, + "step": 41640 + }, + { + "epoch": 1.7117265182019885, + "grad_norm": 1.080946922302246, + "learning_rate": 1.0856341767462364e-05, + "loss": 0.605, + "step": 41660 + }, + { + "epoch": 1.7125482784123593, + "grad_norm": 1.1074481010437012, + "learning_rate": 1.0796117912695736e-05, + "loss": 0.5711, + "step": 41680 + }, + { + "epoch": 1.7133700386227297, + "grad_norm": 1.1239150762557983, + "learning_rate": 1.07360520311378e-05, + "loss": 0.5841, + "step": 41700 + }, + { + "epoch": 1.7141917988331006, + "grad_norm": 1.0257426500320435, + "learning_rate": 1.0676144229160655e-05, + "loss": 0.5723, + "step": 41720 + }, + { + "epoch": 1.715013559043471, + "grad_norm": 1.100321888923645, + "learning_rate": 1.0616394612856361e-05, + "loss": 0.5999, + "step": 41740 + }, + { + "epoch": 1.7158353192538418, + "grad_norm": 1.0664868354797363, + "learning_rate": 1.0556803288036954e-05, + "loss": 0.6029, + "step": 41760 + }, + { + "epoch": 1.7166570794642122, + "grad_norm": 1.017471194267273, + "learning_rate": 1.0497370360234037e-05, + "loss": 0.5844, + "step": 41780 + }, + { + "epoch": 1.717478839674583, + "grad_norm": 1.0937682390213013, + "learning_rate": 1.0438095934698766e-05, + "loss": 0.5844, + "step": 41800 + }, + { + "epoch": 1.7183005998849534, + "grad_norm": 1.0297489166259766, + "learning_rate": 1.037898011640157e-05, + "loss": 0.5882, + "step": 41820 + }, + { + "epoch": 1.7191223600953243, + "grad_norm": 1.0966808795928955, + "learning_rate": 1.032002301003202e-05, + "loss": 0.5827, + "step": 41840 + }, + { + "epoch": 1.7199441203056947, + "grad_norm": 1.1811338663101196, + "learning_rate": 1.026122471999863e-05, + "loss": 0.6009, + "step": 41860 + }, + { + "epoch": 1.7207658805160655, + "grad_norm": 1.0118420124053955, + "learning_rate": 1.0202585350428606e-05, + "loss": 0.5921, + "step": 41880 + }, + { + "epoch": 1.7215876407264359, + "grad_norm": 1.0821563005447388, + "learning_rate": 1.0144105005167836e-05, + "loss": 0.5956, + "step": 41900 + }, + { + "epoch": 1.7224094009368067, + "grad_norm": 1.0998246669769287, + "learning_rate": 1.0085783787780412e-05, + "loss": 0.5851, + "step": 41920 + }, + { + "epoch": 1.7232311611471771, + "grad_norm": 1.2186909914016724, + "learning_rate": 1.0027621801548792e-05, + "loss": 0.6043, + "step": 41940 + }, + { + "epoch": 1.724052921357548, + "grad_norm": 1.1629652976989746, + "learning_rate": 9.972515496304035e-06, + "loss": 0.5626, + "step": 41960 + }, + { + "epoch": 1.7248746815679183, + "grad_norm": 1.0134592056274414, + "learning_rate": 9.914664306824105e-06, + "loss": 0.5605, + "step": 41980 + }, + { + "epoch": 1.7256964417782892, + "grad_norm": 1.134350061416626, + "learning_rate": 9.856972651539343e-06, + "loss": 0.5788, + "step": 42000 + }, + { + "epoch": 1.7256964417782892, + "eval_loss": 0.8728025555610657, + "eval_runtime": 16.6626, + "eval_samples_per_second": 157.238, + "eval_steps_per_second": 4.921, + "step": 42000 + }, + { + "epoch": 1.7265182019886596, + "grad_norm": 1.1034976243972778, + "learning_rate": 9.799440632617284e-06, + "loss": 0.583, + "step": 42020 + }, + { + "epoch": 1.7273399621990304, + "grad_norm": 1.0975291728973389, + "learning_rate": 9.742068351942668e-06, + "loss": 0.5831, + "step": 42040 + }, + { + "epoch": 1.7281617224094008, + "grad_norm": 1.1249350309371948, + "learning_rate": 9.68485591111744e-06, + "loss": 0.5758, + "step": 42060 + }, + { + "epoch": 1.7289834826197716, + "grad_norm": 1.0432801246643066, + "learning_rate": 9.6278034114604e-06, + "loss": 0.5898, + "step": 42080 + }, + { + "epoch": 1.729805242830142, + "grad_norm": 1.0415185689926147, + "learning_rate": 9.570910954007128e-06, + "loss": 0.5781, + "step": 42100 + }, + { + "epoch": 1.7306270030405129, + "grad_norm": 1.0248029232025146, + "learning_rate": 9.517011450287305e-06, + "loss": 0.588, + "step": 42120 + }, + { + "epoch": 1.7314487632508833, + "grad_norm": 1.0104538202285767, + "learning_rate": 9.460431364661492e-06, + "loss": 0.5787, + "step": 42140 + }, + { + "epoch": 1.732270523461254, + "grad_norm": 1.0787935256958008, + "learning_rate": 9.404011617642439e-06, + "loss": 0.6012, + "step": 42160 + }, + { + "epoch": 1.7330922836716245, + "grad_norm": 1.10727059841156, + "learning_rate": 9.347752309145241e-06, + "loss": 0.5939, + "step": 42180 + }, + { + "epoch": 1.7339140438819953, + "grad_norm": 1.0951191186904907, + "learning_rate": 9.291653538800727e-06, + "loss": 0.5707, + "step": 42200 + }, + { + "epoch": 1.7347358040923657, + "grad_norm": 0.9968528151512146, + "learning_rate": 9.235715405955558e-06, + "loss": 0.6074, + "step": 42220 + }, + { + "epoch": 1.7355575643027366, + "grad_norm": 1.2177423238754272, + "learning_rate": 9.17993800967183e-06, + "loss": 0.5915, + "step": 42240 + }, + { + "epoch": 1.736379324513107, + "grad_norm": 1.041279673576355, + "learning_rate": 9.124321448727014e-06, + "loss": 0.5841, + "step": 42260 + }, + { + "epoch": 1.7372010847234778, + "grad_norm": 1.0093954801559448, + "learning_rate": 9.068865821613803e-06, + "loss": 0.5966, + "step": 42280 + }, + { + "epoch": 1.7380228449338482, + "grad_norm": 0.9137701392173767, + "learning_rate": 9.013571226539773e-06, + "loss": 0.5792, + "step": 42300 + }, + { + "epoch": 1.738844605144219, + "grad_norm": 1.1452417373657227, + "learning_rate": 8.958437761427452e-06, + "loss": 0.5968, + "step": 42320 + }, + { + "epoch": 1.7396663653545894, + "grad_norm": 1.0660525560379028, + "learning_rate": 8.903465523913957e-06, + "loss": 0.5883, + "step": 42340 + }, + { + "epoch": 1.7404881255649602, + "grad_norm": 1.2415095567703247, + "learning_rate": 8.848654611350849e-06, + "loss": 0.5766, + "step": 42360 + }, + { + "epoch": 1.7413098857753306, + "grad_norm": 1.097631812095642, + "learning_rate": 8.794005120804082e-06, + "loss": 0.5838, + "step": 42380 + }, + { + "epoch": 1.7421316459857015, + "grad_norm": 0.9962956309318542, + "learning_rate": 8.739517149053689e-06, + "loss": 0.5873, + "step": 42400 + }, + { + "epoch": 1.7429534061960719, + "grad_norm": 1.1835882663726807, + "learning_rate": 8.685190792593656e-06, + "loss": 0.5863, + "step": 42420 + }, + { + "epoch": 1.7437751664064427, + "grad_norm": 1.101272702217102, + "learning_rate": 8.631026147631772e-06, + "loss": 0.5901, + "step": 42440 + }, + { + "epoch": 1.744596926616813, + "grad_norm": 1.1990203857421875, + "learning_rate": 8.577023310089483e-06, + "loss": 0.6065, + "step": 42460 + }, + { + "epoch": 1.745418686827184, + "grad_norm": 1.1126364469528198, + "learning_rate": 8.523182375601635e-06, + "loss": 0.5755, + "step": 42480 + }, + { + "epoch": 1.7462404470375543, + "grad_norm": 1.124306082725525, + "learning_rate": 8.469503439516402e-06, + "loss": 0.6036, + "step": 42500 + }, + { + "epoch": 1.7470622072479252, + "grad_norm": 1.1404842138290405, + "learning_rate": 8.41598659689502e-06, + "loss": 0.5996, + "step": 42520 + }, + { + "epoch": 1.7478839674582956, + "grad_norm": 1.0869922637939453, + "learning_rate": 8.36263194251169e-06, + "loss": 0.5756, + "step": 42540 + }, + { + "epoch": 1.7487057276686664, + "grad_norm": 1.0685384273529053, + "learning_rate": 8.309439570853439e-06, + "loss": 0.5936, + "step": 42560 + }, + { + "epoch": 1.7495274878790368, + "grad_norm": 1.1454116106033325, + "learning_rate": 8.256409576119827e-06, + "loss": 0.5871, + "step": 42580 + }, + { + "epoch": 1.7503492480894076, + "grad_norm": 1.119585633277893, + "learning_rate": 8.203542052222924e-06, + "loss": 0.5948, + "step": 42600 + }, + { + "epoch": 1.751171008299778, + "grad_norm": 1.0509191751480103, + "learning_rate": 8.150837092787034e-06, + "loss": 0.5856, + "step": 42620 + }, + { + "epoch": 1.7519927685101488, + "grad_norm": 0.9895453453063965, + "learning_rate": 8.098294791148565e-06, + "loss": 0.5877, + "step": 42640 + }, + { + "epoch": 1.7528145287205192, + "grad_norm": 1.1008808612823486, + "learning_rate": 8.045915240355917e-06, + "loss": 0.59, + "step": 42660 + }, + { + "epoch": 1.75363628893089, + "grad_norm": 0.9279462695121765, + "learning_rate": 7.993698533169192e-06, + "loss": 0.5911, + "step": 42680 + }, + { + "epoch": 1.7544580491412605, + "grad_norm": 1.1394389867782593, + "learning_rate": 7.941644762060229e-06, + "loss": 0.5756, + "step": 42700 + }, + { + "epoch": 1.7552798093516313, + "grad_norm": 1.0450705289840698, + "learning_rate": 7.889754019212203e-06, + "loss": 0.6016, + "step": 42720 + }, + { + "epoch": 1.7561015695620017, + "grad_norm": 1.2323449850082397, + "learning_rate": 7.838026396519638e-06, + "loss": 0.5808, + "step": 42740 + }, + { + "epoch": 1.7569233297723725, + "grad_norm": 1.1465022563934326, + "learning_rate": 7.786461985588156e-06, + "loss": 0.5987, + "step": 42760 + }, + { + "epoch": 1.757745089982743, + "grad_norm": 0.9561547636985779, + "learning_rate": 7.73506087773439e-06, + "loss": 0.563, + "step": 42780 + }, + { + "epoch": 1.7585668501931138, + "grad_norm": 1.154842734336853, + "learning_rate": 7.683823163985737e-06, + "loss": 0.5682, + "step": 42800 + }, + { + "epoch": 1.7593886104034842, + "grad_norm": 1.1190966367721558, + "learning_rate": 7.632748935080213e-06, + "loss": 0.5896, + "step": 42820 + }, + { + "epoch": 1.760210370613855, + "grad_norm": 1.064261794090271, + "learning_rate": 7.581838281466414e-06, + "loss": 0.5778, + "step": 42840 + }, + { + "epoch": 1.7610321308242254, + "grad_norm": 1.095628261566162, + "learning_rate": 7.531091293303094e-06, + "loss": 0.5657, + "step": 42860 + }, + { + "epoch": 1.7618538910345962, + "grad_norm": 1.1514066457748413, + "learning_rate": 7.480508060459346e-06, + "loss": 0.579, + "step": 42880 + }, + { + "epoch": 1.7626756512449666, + "grad_norm": 1.1395295858383179, + "learning_rate": 7.430088672514124e-06, + "loss": 0.6007, + "step": 42900 + }, + { + "epoch": 1.7634974114553374, + "grad_norm": 1.2230483293533325, + "learning_rate": 7.379833218756338e-06, + "loss": 0.5996, + "step": 42920 + }, + { + "epoch": 1.7643191716657078, + "grad_norm": 1.0743821859359741, + "learning_rate": 7.329741788184485e-06, + "loss": 0.5863, + "step": 42940 + }, + { + "epoch": 1.7651409318760787, + "grad_norm": 1.1711527109146118, + "learning_rate": 7.279814469506652e-06, + "loss": 0.5864, + "step": 42960 + }, + { + "epoch": 1.765962692086449, + "grad_norm": 1.1039119958877563, + "learning_rate": 7.230051351140266e-06, + "loss": 0.5763, + "step": 42980 + }, + { + "epoch": 1.76678445229682, + "grad_norm": 1.0589841604232788, + "learning_rate": 7.180452521211978e-06, + "loss": 0.5857, + "step": 43000 + }, + { + "epoch": 1.7676062125071903, + "grad_norm": 1.1599675416946411, + "learning_rate": 7.131018067557516e-06, + "loss": 0.5798, + "step": 43020 + }, + { + "epoch": 1.7684279727175611, + "grad_norm": 1.10663640499115, + "learning_rate": 7.081748077721462e-06, + "loss": 0.5749, + "step": 43040 + }, + { + "epoch": 1.7692497329279315, + "grad_norm": 1.0838004350662231, + "learning_rate": 7.032642638957232e-06, + "loss": 0.5767, + "step": 43060 + }, + { + "epoch": 1.7700714931383024, + "grad_norm": 1.0529030561447144, + "learning_rate": 6.983701838226708e-06, + "loss": 0.6105, + "step": 43080 + }, + { + "epoch": 1.7708932533486728, + "grad_norm": 1.0537713766098022, + "learning_rate": 6.934925762200328e-06, + "loss": 0.5857, + "step": 43100 + }, + { + "epoch": 1.7717150135590436, + "grad_norm": 1.105526328086853, + "learning_rate": 6.886314497256752e-06, + "loss": 0.5677, + "step": 43120 + }, + { + "epoch": 1.772536773769414, + "grad_norm": 1.0443006753921509, + "learning_rate": 6.837868129482772e-06, + "loss": 0.5791, + "step": 43140 + }, + { + "epoch": 1.7733585339797848, + "grad_norm": 1.0910414457321167, + "learning_rate": 6.789586744673226e-06, + "loss": 0.5779, + "step": 43160 + }, + { + "epoch": 1.7741802941901552, + "grad_norm": 1.2543234825134277, + "learning_rate": 6.741470428330676e-06, + "loss": 0.5898, + "step": 43180 + }, + { + "epoch": 1.775002054400526, + "grad_norm": 0.9940236806869507, + "learning_rate": 6.693519265665449e-06, + "loss": 0.5807, + "step": 43200 + }, + { + "epoch": 1.7758238146108964, + "grad_norm": 1.129135012626648, + "learning_rate": 6.645733341595339e-06, + "loss": 0.5931, + "step": 43220 + }, + { + "epoch": 1.7766455748212673, + "grad_norm": 1.0876851081848145, + "learning_rate": 6.598112740745544e-06, + "loss": 0.587, + "step": 43240 + }, + { + "epoch": 1.7774673350316377, + "grad_norm": 1.1217765808105469, + "learning_rate": 6.550657547448513e-06, + "loss": 0.5664, + "step": 43260 + }, + { + "epoch": 1.7782890952420085, + "grad_norm": 1.006568431854248, + "learning_rate": 6.503367845743702e-06, + "loss": 0.572, + "step": 43280 + }, + { + "epoch": 1.779110855452379, + "grad_norm": 1.0781569480895996, + "learning_rate": 6.456243719377553e-06, + "loss": 0.5824, + "step": 43300 + }, + { + "epoch": 1.7799326156627497, + "grad_norm": 1.0513370037078857, + "learning_rate": 6.4116292395006935e-06, + "loss": 0.5924, + "step": 43320 + }, + { + "epoch": 1.7807543758731201, + "grad_norm": 1.1264588832855225, + "learning_rate": 6.364828224809993e-06, + "loss": 0.609, + "step": 43340 + }, + { + "epoch": 1.781576136083491, + "grad_norm": 1.1192883253097534, + "learning_rate": 6.318193030800956e-06, + "loss": 0.5814, + "step": 43360 + }, + { + "epoch": 1.7823978962938614, + "grad_norm": 1.1564245223999023, + "learning_rate": 6.271723740060908e-06, + "loss": 0.5825, + "step": 43380 + }, + { + "epoch": 1.7832196565042322, + "grad_norm": 1.1175339221954346, + "learning_rate": 6.227731656718094e-06, + "loss": 0.6158, + "step": 43400 + }, + { + "epoch": 1.7840414167146026, + "grad_norm": 1.1350170373916626, + "learning_rate": 6.1815861137816456e-06, + "loss": 0.584, + "step": 43420 + }, + { + "epoch": 1.7848631769249734, + "grad_norm": 1.0391989946365356, + "learning_rate": 6.1356067160345695e-06, + "loss": 0.5725, + "step": 43440 + }, + { + "epoch": 1.7856849371353438, + "grad_norm": 1.0166730880737305, + "learning_rate": 6.089793544902756e-06, + "loss": 0.5822, + "step": 43460 + }, + { + "epoch": 1.7865066973457147, + "grad_norm": 1.161569595336914, + "learning_rate": 6.0441466815178705e-06, + "loss": 0.6067, + "step": 43480 + }, + { + "epoch": 1.787328457556085, + "grad_norm": 1.1493220329284668, + "learning_rate": 5.998666206716985e-06, + "loss": 0.5804, + "step": 43500 + }, + { + "epoch": 1.7881502177664559, + "grad_norm": 1.005832552909851, + "learning_rate": 5.953352201042484e-06, + "loss": 0.5852, + "step": 43520 + }, + { + "epoch": 1.7889719779768263, + "grad_norm": 1.1422655582427979, + "learning_rate": 5.9082047447420405e-06, + "loss": 0.5935, + "step": 43540 + }, + { + "epoch": 1.7897937381871971, + "grad_norm": 1.0794512033462524, + "learning_rate": 5.863223917768268e-06, + "loss": 0.5841, + "step": 43560 + }, + { + "epoch": 1.7906154983975675, + "grad_norm": 0.9649259448051453, + "learning_rate": 5.818409799778779e-06, + "loss": 0.5813, + "step": 43580 + }, + { + "epoch": 1.7914372586079383, + "grad_norm": 1.0412386655807495, + "learning_rate": 5.7737624701359125e-06, + "loss": 0.5912, + "step": 43600 + }, + { + "epoch": 1.7922590188183087, + "grad_norm": 0.9937067031860352, + "learning_rate": 5.729282007906678e-06, + "loss": 0.5652, + "step": 43620 + }, + { + "epoch": 1.7930807790286796, + "grad_norm": 1.1072604656219482, + "learning_rate": 5.68496849186253e-06, + "loss": 0.5813, + "step": 43640 + }, + { + "epoch": 1.79390253923905, + "grad_norm": 1.1642615795135498, + "learning_rate": 5.640822000479307e-06, + "loss": 0.5769, + "step": 43660 + }, + { + "epoch": 1.7947242994494208, + "grad_norm": 1.1445039510726929, + "learning_rate": 5.596842611937025e-06, + "loss": 0.5789, + "step": 43680 + }, + { + "epoch": 1.7955460596597912, + "grad_norm": 1.1920628547668457, + "learning_rate": 5.5530304041198075e-06, + "loss": 0.5743, + "step": 43700 + }, + { + "epoch": 1.796367819870162, + "grad_norm": 1.0678682327270508, + "learning_rate": 5.509385454615712e-06, + "loss": 0.5811, + "step": 43720 + }, + { + "epoch": 1.7971895800805324, + "grad_norm": 1.15229070186615, + "learning_rate": 5.465907840716555e-06, + "loss": 0.5835, + "step": 43740 + }, + { + "epoch": 1.7980113402909033, + "grad_norm": 1.0926685333251953, + "learning_rate": 5.422597639417903e-06, + "loss": 0.5952, + "step": 43760 + }, + { + "epoch": 1.7988331005012737, + "grad_norm": 1.0511040687561035, + "learning_rate": 5.379454927418714e-06, + "loss": 0.5803, + "step": 43780 + }, + { + "epoch": 1.7996548607116445, + "grad_norm": 1.1438969373703003, + "learning_rate": 5.336479781121473e-06, + "loss": 0.5866, + "step": 43800 + }, + { + "epoch": 1.8004766209220149, + "grad_norm": 1.0851142406463623, + "learning_rate": 5.293672276631823e-06, + "loss": 0.5861, + "step": 43820 + }, + { + "epoch": 1.8012983811323857, + "grad_norm": 1.071152925491333, + "learning_rate": 5.251032489758545e-06, + "loss": 0.5965, + "step": 43840 + }, + { + "epoch": 1.802120141342756, + "grad_norm": 1.1403220891952515, + "learning_rate": 5.208560496013471e-06, + "loss": 0.5796, + "step": 43860 + }, + { + "epoch": 1.802941901553127, + "grad_norm": 0.9966292977333069, + "learning_rate": 5.166256370611189e-06, + "loss": 0.5664, + "step": 43880 + }, + { + "epoch": 1.8037636617634973, + "grad_norm": 1.014594316482544, + "learning_rate": 5.124120188469061e-06, + "loss": 0.5889, + "step": 43900 + }, + { + "epoch": 1.8045854219738682, + "grad_norm": 1.1401876211166382, + "learning_rate": 5.082152024207032e-06, + "loss": 0.5886, + "step": 43920 + }, + { + "epoch": 1.8054071821842386, + "grad_norm": 1.0322624444961548, + "learning_rate": 5.04035195214747e-06, + "loss": 0.5951, + "step": 43940 + }, + { + "epoch": 1.8062289423946094, + "grad_norm": 1.0889531373977661, + "learning_rate": 4.998720046315097e-06, + "loss": 0.5795, + "step": 43960 + }, + { + "epoch": 1.8070507026049798, + "grad_norm": 1.1183452606201172, + "learning_rate": 4.957256380436826e-06, + "loss": 0.5964, + "step": 43980 + }, + { + "epoch": 1.8078724628153506, + "grad_norm": 1.037669062614441, + "learning_rate": 4.915961027941596e-06, + "loss": 0.578, + "step": 44000 + }, + { + "epoch": 1.8078724628153506, + "eval_loss": 0.8637903928756714, + "eval_runtime": 21.3438, + "eval_samples_per_second": 122.752, + "eval_steps_per_second": 3.842, + "step": 44000 + }, + { + "epoch": 1.808694223025721, + "grad_norm": 0.8755192160606384, + "learning_rate": 4.874834061960298e-06, + "loss": 0.4632, + "step": 44020 + }, + { + "epoch": 1.8095159832360919, + "grad_norm": 0.953360915184021, + "learning_rate": 4.83387555532564e-06, + "loss": 0.4426, + "step": 44040 + }, + { + "epoch": 1.8103377434464623, + "grad_norm": 0.9059198498725891, + "learning_rate": 4.7930855805719875e-06, + "loss": 0.4451, + "step": 44060 + }, + { + "epoch": 1.811159503656833, + "grad_norm": 0.9322590231895447, + "learning_rate": 4.752464209935215e-06, + "loss": 0.4425, + "step": 44080 + }, + { + "epoch": 1.8119812638672035, + "grad_norm": 0.9682320952415466, + "learning_rate": 4.712011515352688e-06, + "loss": 0.4406, + "step": 44100 + }, + { + "epoch": 1.8128030240775743, + "grad_norm": 0.9625361561775208, + "learning_rate": 4.671727568462958e-06, + "loss": 0.4296, + "step": 44120 + }, + { + "epoch": 1.8136247842879447, + "grad_norm": 0.8894620537757874, + "learning_rate": 4.631612440605837e-06, + "loss": 0.4459, + "step": 44140 + }, + { + "epoch": 1.8144465444983155, + "grad_norm": 0.9132700562477112, + "learning_rate": 4.5916662028221094e-06, + "loss": 0.438, + "step": 44160 + }, + { + "epoch": 1.815268304708686, + "grad_norm": 1.0488115549087524, + "learning_rate": 4.551888925853509e-06, + "loss": 0.4485, + "step": 44180 + }, + { + "epoch": 1.8160900649190568, + "grad_norm": 0.9608358144760132, + "learning_rate": 4.512280680142522e-06, + "loss": 0.4416, + "step": 44200 + }, + { + "epoch": 1.8169118251294272, + "grad_norm": 1.0176913738250732, + "learning_rate": 4.472841535832295e-06, + "loss": 0.4422, + "step": 44220 + }, + { + "epoch": 1.817733585339798, + "grad_norm": 0.9737944602966309, + "learning_rate": 4.433571562766514e-06, + "loss": 0.4217, + "step": 44240 + }, + { + "epoch": 1.8185553455501684, + "grad_norm": 0.9006738662719727, + "learning_rate": 4.394470830489272e-06, + "loss": 0.4589, + "step": 44260 + }, + { + "epoch": 1.8193771057605392, + "grad_norm": 1.0624366998672485, + "learning_rate": 4.355539408244991e-06, + "loss": 0.4506, + "step": 44280 + }, + { + "epoch": 1.8201988659709096, + "grad_norm": 1.1879520416259766, + "learning_rate": 4.316777364978175e-06, + "loss": 0.4532, + "step": 44300 + }, + { + "epoch": 1.8210206261812805, + "grad_norm": 1.1148782968521118, + "learning_rate": 4.278184769333482e-06, + "loss": 0.4299, + "step": 44320 + }, + { + "epoch": 1.8218423863916509, + "grad_norm": 0.9817942380905151, + "learning_rate": 4.239761689655364e-06, + "loss": 0.4249, + "step": 44340 + }, + { + "epoch": 1.8226641466020217, + "grad_norm": 0.9905684590339661, + "learning_rate": 4.201508193988168e-06, + "loss": 0.4374, + "step": 44360 + }, + { + "epoch": 1.823485906812392, + "grad_norm": 1.0558414459228516, + "learning_rate": 4.163424350075895e-06, + "loss": 0.4501, + "step": 44380 + }, + { + "epoch": 1.824307667022763, + "grad_norm": 1.105684757232666, + "learning_rate": 4.1274018997290775e-06, + "loss": 0.438, + "step": 44400 + }, + { + "epoch": 1.8251294272331333, + "grad_norm": 0.9954769611358643, + "learning_rate": 4.089649070449642e-06, + "loss": 0.4404, + "step": 44420 + }, + { + "epoch": 1.825951187443504, + "grad_norm": 1.0505714416503906, + "learning_rate": 4.052066091019047e-06, + "loss": 0.4338, + "step": 44440 + }, + { + "epoch": 1.8267729476538745, + "grad_norm": 1.083191990852356, + "learning_rate": 4.014653027993797e-06, + "loss": 0.4334, + "step": 44460 + }, + { + "epoch": 1.8275947078642452, + "grad_norm": 1.1424214839935303, + "learning_rate": 3.977409947629595e-06, + "loss": 0.4414, + "step": 44480 + }, + { + "epoch": 1.8284164680746158, + "grad_norm": 0.9991270899772644, + "learning_rate": 3.9403369158810595e-06, + "loss": 0.4419, + "step": 44500 + }, + { + "epoch": 1.8292382282849864, + "grad_norm": 1.0082736015319824, + "learning_rate": 3.903433998401662e-06, + "loss": 0.4346, + "step": 44520 + }, + { + "epoch": 1.830059988495357, + "grad_norm": 1.122463345527649, + "learning_rate": 3.866701260543637e-06, + "loss": 0.4286, + "step": 44540 + }, + { + "epoch": 1.8308817487057276, + "grad_norm": 1.15240478515625, + "learning_rate": 3.830138767357827e-06, + "loss": 0.4248, + "step": 44560 + }, + { + "epoch": 1.8317035089160982, + "grad_norm": 1.1562724113464355, + "learning_rate": 3.7955621469369153e-06, + "loss": 0.4267, + "step": 44580 + }, + { + "epoch": 1.8325252691264688, + "grad_norm": 0.9919096827507019, + "learning_rate": 3.7593318168225867e-06, + "loss": 0.4265, + "step": 44600 + }, + { + "epoch": 1.8333470293368395, + "grad_norm": 0.9958898425102234, + "learning_rate": 3.723271921523508e-06, + "loss": 0.4438, + "step": 44620 + }, + { + "epoch": 1.83416878954721, + "grad_norm": 1.0441087484359741, + "learning_rate": 3.6873825248989524e-06, + "loss": 0.4661, + "step": 44640 + }, + { + "epoch": 1.8349905497575807, + "grad_norm": 1.0398614406585693, + "learning_rate": 3.651663690506313e-06, + "loss": 0.4385, + "step": 44660 + }, + { + "epoch": 1.8358123099679513, + "grad_norm": 1.0855858325958252, + "learning_rate": 3.616115481600857e-06, + "loss": 0.4385, + "step": 44680 + }, + { + "epoch": 1.836634070178322, + "grad_norm": 1.2031474113464355, + "learning_rate": 3.5807379611357826e-06, + "loss": 0.4322, + "step": 44700 + }, + { + "epoch": 1.8374558303886925, + "grad_norm": 1.0220446586608887, + "learning_rate": 3.545531191761897e-06, + "loss": 0.419, + "step": 44720 + }, + { + "epoch": 1.8382775905990631, + "grad_norm": 1.1677261590957642, + "learning_rate": 3.5104952358277154e-06, + "loss": 0.428, + "step": 44740 + }, + { + "epoch": 1.8390993508094338, + "grad_norm": 1.097439169883728, + "learning_rate": 3.475630155379206e-06, + "loss": 0.4484, + "step": 44760 + }, + { + "epoch": 1.8399211110198044, + "grad_norm": 1.115867257118225, + "learning_rate": 3.4409360121597235e-06, + "loss": 0.4268, + "step": 44780 + }, + { + "epoch": 1.840742871230175, + "grad_norm": 1.5234203338623047, + "learning_rate": 3.406412867609976e-06, + "loss": 0.4448, + "step": 44800 + }, + { + "epoch": 1.8415646314405456, + "grad_norm": 1.0893431901931763, + "learning_rate": 3.3720607828677362e-06, + "loss": 0.4382, + "step": 44820 + }, + { + "epoch": 1.8423863916509162, + "grad_norm": 1.0602861642837524, + "learning_rate": 3.3378798187679418e-06, + "loss": 0.4308, + "step": 44840 + }, + { + "epoch": 1.8432081518612868, + "grad_norm": 1.249525547027588, + "learning_rate": 3.303870035842427e-06, + "loss": 0.4261, + "step": 44860 + }, + { + "epoch": 1.8440299120716575, + "grad_norm": 1.2411805391311646, + "learning_rate": 3.270031494319925e-06, + "loss": 0.4362, + "step": 44880 + }, + { + "epoch": 1.844851672282028, + "grad_norm": 1.0888171195983887, + "learning_rate": 3.2363642541258676e-06, + "loss": 0.4348, + "step": 44900 + }, + { + "epoch": 1.8456734324923987, + "grad_norm": 1.056647539138794, + "learning_rate": 3.2028683748823505e-06, + "loss": 0.4356, + "step": 44920 + }, + { + "epoch": 1.8464951927027693, + "grad_norm": 1.0287542343139648, + "learning_rate": 3.169543915907991e-06, + "loss": 0.4384, + "step": 44940 + }, + { + "epoch": 1.84731695291314, + "grad_norm": 1.0829901695251465, + "learning_rate": 3.136390936217848e-06, + "loss": 0.4452, + "step": 44960 + }, + { + "epoch": 1.8481387131235105, + "grad_norm": 0.9647024273872375, + "learning_rate": 3.1034094945233018e-06, + "loss": 0.4303, + "step": 44980 + }, + { + "epoch": 1.8489604733338811, + "grad_norm": 1.252030611038208, + "learning_rate": 3.07059964923192e-06, + "loss": 0.454, + "step": 45000 + }, + { + "epoch": 1.8497822335442518, + "grad_norm": 1.0106678009033203, + "learning_rate": 3.037961458447469e-06, + "loss": 0.4371, + "step": 45020 + }, + { + "epoch": 1.8506039937546224, + "grad_norm": 1.0734151601791382, + "learning_rate": 3.0054949799696142e-06, + "loss": 0.4328, + "step": 45040 + }, + { + "epoch": 1.851425753964993, + "grad_norm": 1.1433912515640259, + "learning_rate": 2.9732002712940187e-06, + "loss": 0.4195, + "step": 45060 + }, + { + "epoch": 1.8522475141753636, + "grad_norm": 1.105094313621521, + "learning_rate": 2.9410773896121237e-06, + "loss": 0.4369, + "step": 45080 + }, + { + "epoch": 1.8530692743857342, + "grad_norm": 1.1068464517593384, + "learning_rate": 2.909126391811068e-06, + "loss": 0.4287, + "step": 45100 + }, + { + "epoch": 1.8538910345961048, + "grad_norm": 1.1528667211532593, + "learning_rate": 2.8773473344736235e-06, + "loss": 0.4261, + "step": 45120 + }, + { + "epoch": 1.8547127948064754, + "grad_norm": 1.1072239875793457, + "learning_rate": 2.8457402738780504e-06, + "loss": 0.4389, + "step": 45140 + }, + { + "epoch": 1.855534555016846, + "grad_norm": 0.9646241664886475, + "learning_rate": 2.8143052659980185e-06, + "loss": 0.4193, + "step": 45160 + }, + { + "epoch": 1.8563563152272167, + "grad_norm": 1.1033663749694824, + "learning_rate": 2.7830423665024862e-06, + "loss": 0.4257, + "step": 45180 + }, + { + "epoch": 1.8571780754375873, + "grad_norm": 1.1620920896530151, + "learning_rate": 2.751951630755689e-06, + "loss": 0.4328, + "step": 45200 + }, + { + "epoch": 1.857999835647958, + "grad_norm": 0.9689277410507202, + "learning_rate": 2.7210331138168955e-06, + "loss": 0.4262, + "step": 45220 + }, + { + "epoch": 1.8588215958583285, + "grad_norm": 1.0218113660812378, + "learning_rate": 2.6902868704404172e-06, + "loss": 0.4227, + "step": 45240 + }, + { + "epoch": 1.8596433560686991, + "grad_norm": 1.1273281574249268, + "learning_rate": 2.6597129550754997e-06, + "loss": 0.4125, + "step": 45260 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 1.068606972694397, + "learning_rate": 2.629311421866165e-06, + "loss": 0.4241, + "step": 45280 + }, + { + "epoch": 1.8612868764894404, + "grad_norm": 1.105440616607666, + "learning_rate": 2.5990823246512253e-06, + "loss": 0.4358, + "step": 45300 + }, + { + "epoch": 1.862108636699811, + "grad_norm": 1.0128626823425293, + "learning_rate": 2.5690257169640688e-06, + "loss": 0.4277, + "step": 45320 + }, + { + "epoch": 1.8629303969101816, + "grad_norm": 0.9848488569259644, + "learning_rate": 2.5391416520326284e-06, + "loss": 0.4298, + "step": 45340 + }, + { + "epoch": 1.8637521571205522, + "grad_norm": 1.1279760599136353, + "learning_rate": 2.5094301827792933e-06, + "loss": 0.4312, + "step": 45360 + }, + { + "epoch": 1.8645739173309228, + "grad_norm": 1.1564669609069824, + "learning_rate": 2.479891361820785e-06, + "loss": 0.4386, + "step": 45380 + }, + { + "epoch": 1.8653956775412934, + "grad_norm": 1.040480375289917, + "learning_rate": 2.4505252414680713e-06, + "loss": 0.4208, + "step": 45400 + }, + { + "epoch": 1.866217437751664, + "grad_norm": 1.0698386430740356, + "learning_rate": 2.421331873726296e-06, + "loss": 0.4353, + "step": 45420 + }, + { + "epoch": 1.8670391979620347, + "grad_norm": 1.1802546977996826, + "learning_rate": 2.3923113102946816e-06, + "loss": 0.4328, + "step": 45440 + }, + { + "epoch": 1.8678609581724053, + "grad_norm": 1.0874077081680298, + "learning_rate": 2.363463602566396e-06, + "loss": 0.4199, + "step": 45460 + }, + { + "epoch": 1.8686827183827759, + "grad_norm": 1.2014697790145874, + "learning_rate": 2.334788801628518e-06, + "loss": 0.4279, + "step": 45480 + }, + { + "epoch": 1.8695044785931465, + "grad_norm": 1.140428066253662, + "learning_rate": 2.3062869582619053e-06, + "loss": 0.4294, + "step": 45500 + }, + { + "epoch": 1.8703262388035171, + "grad_norm": 1.1033835411071777, + "learning_rate": 2.277958122941115e-06, + "loss": 0.4159, + "step": 45520 + }, + { + "epoch": 1.8711479990138877, + "grad_norm": 1.0941648483276367, + "learning_rate": 2.249802345834373e-06, + "loss": 0.4241, + "step": 45540 + }, + { + "epoch": 1.8719697592242583, + "grad_norm": 1.1285514831542969, + "learning_rate": 2.2218196768033496e-06, + "loss": 0.421, + "step": 45560 + }, + { + "epoch": 1.872791519434629, + "grad_norm": 1.0075688362121582, + "learning_rate": 2.1940101654032487e-06, + "loss": 0.4261, + "step": 45580 + }, + { + "epoch": 1.8736132796449996, + "grad_norm": 1.2179702520370483, + "learning_rate": 2.166373860882509e-06, + "loss": 0.4311, + "step": 45600 + }, + { + "epoch": 1.8744350398553702, + "grad_norm": 1.1928651332855225, + "learning_rate": 2.1389108121829593e-06, + "loss": 0.413, + "step": 45620 + }, + { + "epoch": 1.8752568000657408, + "grad_norm": 1.330972671508789, + "learning_rate": 2.1116210679395066e-06, + "loss": 0.4281, + "step": 45640 + }, + { + "epoch": 1.8760785602761114, + "grad_norm": 1.1296700239181519, + "learning_rate": 2.0845046764801924e-06, + "loss": 0.4361, + "step": 45660 + }, + { + "epoch": 1.876900320486482, + "grad_norm": 1.0228627920150757, + "learning_rate": 2.057561685826093e-06, + "loss": 0.4562, + "step": 45680 + }, + { + "epoch": 1.8777220806968526, + "grad_norm": 1.1409462690353394, + "learning_rate": 2.030792143691118e-06, + "loss": 0.4378, + "step": 45700 + }, + { + "epoch": 1.8785438409072233, + "grad_norm": 1.078661322593689, + "learning_rate": 2.0041960974821027e-06, + "loss": 0.4299, + "step": 45720 + }, + { + "epoch": 1.8793656011175939, + "grad_norm": 1.0520544052124023, + "learning_rate": 1.977773594298582e-06, + "loss": 0.434, + "step": 45740 + }, + { + "epoch": 1.8801873613279645, + "grad_norm": 1.0248843431472778, + "learning_rate": 1.9515246809327815e-06, + "loss": 0.4227, + "step": 45760 + }, + { + "epoch": 1.881009121538335, + "grad_norm": 0.9711484909057617, + "learning_rate": 1.926749043146392e-06, + "loss": 0.4221, + "step": 45780 + }, + { + "epoch": 1.8818308817487057, + "grad_norm": 1.135709524154663, + "learning_rate": 1.9008387633469904e-06, + "loss": 0.4176, + "step": 45800 + }, + { + "epoch": 1.8826526419590763, + "grad_norm": 1.0329680442810059, + "learning_rate": 1.8763849105989163e-06, + "loss": 0.4137, + "step": 45820 + }, + { + "epoch": 1.883474402169447, + "grad_norm": 1.0853397846221924, + "learning_rate": 1.8508134388439902e-06, + "loss": 0.4235, + "step": 45840 + }, + { + "epoch": 1.8842961623798176, + "grad_norm": 1.2208008766174316, + "learning_rate": 1.8254157817434447e-06, + "loss": 0.4487, + "step": 45860 + }, + { + "epoch": 1.8851179225901882, + "grad_norm": 0.9802207946777344, + "learning_rate": 1.8001919842745686e-06, + "loss": 0.442, + "step": 45880 + }, + { + "epoch": 1.8859396828005588, + "grad_norm": 1.0660961866378784, + "learning_rate": 1.775142091106774e-06, + "loss": 0.4303, + "step": 45900 + }, + { + "epoch": 1.8867614430109294, + "grad_norm": 1.0748445987701416, + "learning_rate": 1.750266146601498e-06, + "loss": 0.429, + "step": 45920 + }, + { + "epoch": 1.8875832032213, + "grad_norm": 1.2010825872421265, + "learning_rate": 1.7255641948121127e-06, + "loss": 0.4155, + "step": 45940 + }, + { + "epoch": 1.8884049634316706, + "grad_norm": 1.149553656578064, + "learning_rate": 1.7010362794838918e-06, + "loss": 0.4232, + "step": 45960 + }, + { + "epoch": 1.8892267236420413, + "grad_norm": 1.208585262298584, + "learning_rate": 1.6766824440538565e-06, + "loss": 0.4242, + "step": 45980 + }, + { + "epoch": 1.8900484838524119, + "grad_norm": 1.1593657732009888, + "learning_rate": 1.6525027316507957e-06, + "loss": 0.4376, + "step": 46000 + }, + { + "epoch": 1.8900484838524119, + "eval_loss": 0.9416248798370361, + "eval_runtime": 16.5465, + "eval_samples_per_second": 158.341, + "eval_steps_per_second": 4.956, + "step": 46000 + }, + { + "epoch": 1.8908702440627825, + "grad_norm": 0.9449489712715149, + "learning_rate": 1.6284971850951125e-06, + "loss": 0.4212, + "step": 46020 + }, + { + "epoch": 1.891692004273153, + "grad_norm": 1.046122670173645, + "learning_rate": 1.6046658468987897e-06, + "loss": 0.4332, + "step": 46040 + }, + { + "epoch": 1.8925137644835237, + "grad_norm": 1.1397738456726074, + "learning_rate": 1.58100875926529e-06, + "loss": 0.4395, + "step": 46060 + }, + { + "epoch": 1.8933355246938943, + "grad_norm": 1.2659273147583008, + "learning_rate": 1.557525964089479e-06, + "loss": 0.414, + "step": 46080 + }, + { + "epoch": 1.894157284904265, + "grad_norm": 1.2042860984802246, + "learning_rate": 1.5342175029576023e-06, + "loss": 0.4244, + "step": 46100 + }, + { + "epoch": 1.8949790451146356, + "grad_norm": 1.1012762784957886, + "learning_rate": 1.51108341714713e-06, + "loss": 0.4324, + "step": 46120 + }, + { + "epoch": 1.8958008053250062, + "grad_norm": 1.114878535270691, + "learning_rate": 1.4881237476267574e-06, + "loss": 0.4369, + "step": 46140 + }, + { + "epoch": 1.8966225655353768, + "grad_norm": 1.1757618188858032, + "learning_rate": 1.4653385350563043e-06, + "loss": 0.4101, + "step": 46160 + }, + { + "epoch": 1.8974443257457474, + "grad_norm": 1.1672312021255493, + "learning_rate": 1.4427278197866045e-06, + "loss": 0.4138, + "step": 46180 + }, + { + "epoch": 1.898266085956118, + "grad_norm": 1.184259057044983, + "learning_rate": 1.4202916418595058e-06, + "loss": 0.4153, + "step": 46200 + }, + { + "epoch": 1.8990878461664886, + "grad_norm": 1.2748756408691406, + "learning_rate": 1.3980300410077584e-06, + "loss": 0.4031, + "step": 46220 + }, + { + "epoch": 1.8999096063768592, + "grad_norm": 1.1279730796813965, + "learning_rate": 1.3759430566549269e-06, + "loss": 0.4296, + "step": 46240 + }, + { + "epoch": 1.9007313665872299, + "grad_norm": 1.1729366779327393, + "learning_rate": 1.3540307279153674e-06, + "loss": 0.4261, + "step": 46260 + }, + { + "epoch": 1.9015531267976005, + "grad_norm": 1.0671958923339844, + "learning_rate": 1.3322930935941502e-06, + "loss": 0.4218, + "step": 46280 + }, + { + "epoch": 1.902374887007971, + "grad_norm": 1.169188380241394, + "learning_rate": 1.3107301921869152e-06, + "loss": 0.4216, + "step": 46300 + }, + { + "epoch": 1.9031966472183417, + "grad_norm": 1.2280913591384888, + "learning_rate": 1.289342061879928e-06, + "loss": 0.4149, + "step": 46320 + }, + { + "epoch": 1.9040184074287123, + "grad_norm": 1.1291908025741577, + "learning_rate": 1.268128740549912e-06, + "loss": 0.4208, + "step": 46340 + }, + { + "epoch": 1.904840167639083, + "grad_norm": 1.114375352859497, + "learning_rate": 1.2470902657640172e-06, + "loss": 0.4061, + "step": 46360 + }, + { + "epoch": 1.9056619278494535, + "grad_norm": 1.1792467832565308, + "learning_rate": 1.2262266747797847e-06, + "loss": 0.4135, + "step": 46380 + }, + { + "epoch": 1.9064836880598242, + "grad_norm": 1.1172113418579102, + "learning_rate": 1.2055380045450038e-06, + "loss": 0.4259, + "step": 46400 + }, + { + "epoch": 1.9073054482701948, + "grad_norm": 1.136423945426941, + "learning_rate": 1.1850242916977449e-06, + "loss": 0.4274, + "step": 46420 + }, + { + "epoch": 1.9081272084805654, + "grad_norm": 1.257373332977295, + "learning_rate": 1.164685572566182e-06, + "loss": 0.421, + "step": 46440 + }, + { + "epoch": 1.908948968690936, + "grad_norm": 1.100024938583374, + "learning_rate": 1.1445218831686367e-06, + "loss": 0.4245, + "step": 46460 + }, + { + "epoch": 1.9097707289013066, + "grad_norm": 1.2640262842178345, + "learning_rate": 1.1245332592134562e-06, + "loss": 0.4188, + "step": 46480 + }, + { + "epoch": 1.9105924891116772, + "grad_norm": 1.1880619525909424, + "learning_rate": 1.104719736098958e-06, + "loss": 0.4258, + "step": 46500 + }, + { + "epoch": 1.9114142493220478, + "grad_norm": 1.1756030321121216, + "learning_rate": 1.0850813489133528e-06, + "loss": 0.4242, + "step": 46520 + }, + { + "epoch": 1.9122360095324185, + "grad_norm": 1.2056884765625, + "learning_rate": 1.0656181324347093e-06, + "loss": 0.4232, + "step": 46540 + }, + { + "epoch": 1.913057769742789, + "grad_norm": 1.0848861932754517, + "learning_rate": 1.0463301211309006e-06, + "loss": 0.422, + "step": 46560 + }, + { + "epoch": 1.9138795299531597, + "grad_norm": 1.2697360515594482, + "learning_rate": 1.027217349159504e-06, + "loss": 0.4369, + "step": 46580 + }, + { + "epoch": 1.9147012901635303, + "grad_norm": 1.1444509029388428, + "learning_rate": 1.008279850367766e-06, + "loss": 0.4231, + "step": 46600 + }, + { + "epoch": 1.915523050373901, + "grad_norm": 1.225815773010254, + "learning_rate": 9.895176582925492e-07, + "loss": 0.4063, + "step": 46620 + }, + { + "epoch": 1.9163448105842715, + "grad_norm": 1.1986970901489258, + "learning_rate": 9.709308061602417e-07, + "loss": 0.4194, + "step": 46640 + }, + { + "epoch": 1.9171665707946421, + "grad_norm": 1.0552966594696045, + "learning_rate": 9.525193268867249e-07, + "loss": 0.4368, + "step": 46660 + }, + { + "epoch": 1.9179883310050128, + "grad_norm": 1.2278395891189575, + "learning_rate": 9.342832530773061e-07, + "loss": 0.4247, + "step": 46680 + }, + { + "epoch": 1.9188100912153834, + "grad_norm": 1.1720478534698486, + "learning_rate": 9.162226170266964e-07, + "loss": 0.4484, + "step": 46700 + }, + { + "epoch": 1.919631851425754, + "grad_norm": 1.0597708225250244, + "learning_rate": 8.983374507188669e-07, + "loss": 0.4158, + "step": 46720 + }, + { + "epoch": 1.9204536116361246, + "grad_norm": 1.2956316471099854, + "learning_rate": 8.806277858270706e-07, + "loss": 0.4274, + "step": 46740 + }, + { + "epoch": 1.9212753718464952, + "grad_norm": 0.9802606701850891, + "learning_rate": 8.630936537137757e-07, + "loss": 0.4307, + "step": 46760 + }, + { + "epoch": 1.9220971320568658, + "grad_norm": 1.118882417678833, + "learning_rate": 8.457350854305657e-07, + "loss": 0.415, + "step": 46780 + }, + { + "epoch": 1.9229188922672364, + "grad_norm": 1.2108888626098633, + "learning_rate": 8.285521117181394e-07, + "loss": 0.412, + "step": 46800 + }, + { + "epoch": 1.923740652477607, + "grad_norm": 1.1903843879699707, + "learning_rate": 8.115447630062112e-07, + "loss": 0.4167, + "step": 46820 + }, + { + "epoch": 1.9245624126879777, + "grad_norm": 1.017788052558899, + "learning_rate": 7.947130694134996e-07, + "loss": 0.4249, + "step": 46840 + }, + { + "epoch": 1.9253841728983483, + "grad_norm": 1.0981863737106323, + "learning_rate": 7.780570607476278e-07, + "loss": 0.4072, + "step": 46860 + }, + { + "epoch": 1.926205933108719, + "grad_norm": 1.1456172466278076, + "learning_rate": 7.615767665051122e-07, + "loss": 0.4233, + "step": 46880 + }, + { + "epoch": 1.9270276933190895, + "grad_norm": 1.1544848680496216, + "learning_rate": 7.452722158712732e-07, + "loss": 0.4324, + "step": 46900 + }, + { + "epoch": 1.9278494535294601, + "grad_norm": 1.125557780265808, + "learning_rate": 7.291434377202255e-07, + "loss": 0.4248, + "step": 46920 + }, + { + "epoch": 1.9286712137398307, + "grad_norm": 0.9662781357765198, + "learning_rate": 7.139839337578336e-07, + "loss": 0.4405, + "step": 46940 + }, + { + "epoch": 1.9294929739502014, + "grad_norm": 1.2945139408111572, + "learning_rate": 6.981979938184835e-07, + "loss": 0.4191, + "step": 46960 + }, + { + "epoch": 1.930314734160572, + "grad_norm": 1.1728248596191406, + "learning_rate": 6.825879097267507e-07, + "loss": 0.4314, + "step": 46980 + }, + { + "epoch": 1.9311364943709426, + "grad_norm": 1.1901942491531372, + "learning_rate": 6.671537091268775e-07, + "loss": 0.4168, + "step": 47000 + }, + { + "epoch": 1.9319582545813132, + "grad_norm": 1.1235634088516235, + "learning_rate": 6.526541555399446e-07, + "loss": 0.422, + "step": 47020 + }, + { + "epoch": 1.9327800147916838, + "grad_norm": 1.1675294637680054, + "learning_rate": 6.375630060813875e-07, + "loss": 0.4172, + "step": 47040 + }, + { + "epoch": 1.9336017750020544, + "grad_norm": 1.1512994766235352, + "learning_rate": 6.226478198503527e-07, + "loss": 0.4234, + "step": 47060 + }, + { + "epoch": 1.934423535212425, + "grad_norm": 1.2757488489151, + "learning_rate": 6.07908623260467e-07, + "loss": 0.4237, + "step": 47080 + }, + { + "epoch": 1.9352452954227957, + "grad_norm": 1.087795376777649, + "learning_rate": 5.933454424137175e-07, + "loss": 0.4213, + "step": 47100 + }, + { + "epoch": 1.9360670556331663, + "grad_norm": 1.1851129531860352, + "learning_rate": 5.789583031003743e-07, + "loss": 0.4231, + "step": 47120 + }, + { + "epoch": 1.936888815843537, + "grad_norm": 1.0666377544403076, + "learning_rate": 5.647472307989676e-07, + "loss": 0.4154, + "step": 47140 + }, + { + "epoch": 1.9377105760539075, + "grad_norm": 1.0941452980041504, + "learning_rate": 5.507122506761886e-07, + "loss": 0.4094, + "step": 47160 + }, + { + "epoch": 1.9385323362642781, + "grad_norm": 1.3112642765045166, + "learning_rate": 5.368533875869331e-07, + "loss": 0.4098, + "step": 47180 + }, + { + "epoch": 1.9393540964746487, + "grad_norm": 0.9906901121139526, + "learning_rate": 5.231706660741575e-07, + "loss": 0.4294, + "step": 47200 + }, + { + "epoch": 1.9401758566850194, + "grad_norm": 1.081827998161316, + "learning_rate": 5.096641103689348e-07, + "loss": 0.4079, + "step": 47220 + }, + { + "epoch": 1.94099761689539, + "grad_norm": 1.2195242643356323, + "learning_rate": 4.963337443902982e-07, + "loss": 0.4213, + "step": 47240 + }, + { + "epoch": 1.9418193771057606, + "grad_norm": 1.2033476829528809, + "learning_rate": 4.831795917453418e-07, + "loss": 0.4276, + "step": 47260 + }, + { + "epoch": 1.9426411373161312, + "grad_norm": 1.0639315843582153, + "learning_rate": 4.702016757290206e-07, + "loss": 0.411, + "step": 47280 + }, + { + "epoch": 1.9434628975265018, + "grad_norm": 1.079074501991272, + "learning_rate": 4.5740001932425e-07, + "loss": 0.4235, + "step": 47300 + }, + { + "epoch": 1.9442846577368724, + "grad_norm": 1.1713237762451172, + "learning_rate": 4.44774645201762e-07, + "loss": 0.4091, + "step": 47320 + }, + { + "epoch": 1.945106417947243, + "grad_norm": 1.232969045639038, + "learning_rate": 4.323255757201383e-07, + "loss": 0.4075, + "step": 47340 + }, + { + "epoch": 1.9459281781576137, + "grad_norm": 1.1638152599334717, + "learning_rate": 4.200528329257103e-07, + "loss": 0.4281, + "step": 47360 + }, + { + "epoch": 1.9467499383679843, + "grad_norm": 1.2757538557052612, + "learning_rate": 4.0795643855255914e-07, + "loss": 0.4385, + "step": 47380 + }, + { + "epoch": 1.9475716985783549, + "grad_norm": 1.08627450466156, + "learning_rate": 3.960364140224826e-07, + "loss": 0.429, + "step": 47400 + }, + { + "epoch": 1.9483934587887255, + "grad_norm": 1.2395083904266357, + "learning_rate": 3.842927804449059e-07, + "loss": 0.4194, + "step": 47420 + }, + { + "epoch": 1.949215218999096, + "grad_norm": 1.2743449211120605, + "learning_rate": 3.727255586169265e-07, + "loss": 0.4343, + "step": 47440 + }, + { + "epoch": 1.9500369792094667, + "grad_norm": 1.1402217149734497, + "learning_rate": 3.6133476902318055e-07, + "loss": 0.4113, + "step": 47460 + }, + { + "epoch": 1.9508587394198373, + "grad_norm": 1.2253342866897583, + "learning_rate": 3.501204318358875e-07, + "loss": 0.4159, + "step": 47480 + }, + { + "epoch": 1.951680499630208, + "grad_norm": 1.1899055242538452, + "learning_rate": 3.390825669147724e-07, + "loss": 0.4185, + "step": 47500 + }, + { + "epoch": 1.9525022598405786, + "grad_norm": 1.0581785440444946, + "learning_rate": 3.282211938070545e-07, + "loss": 0.407, + "step": 47520 + }, + { + "epoch": 1.9533240200509492, + "grad_norm": 1.143174171447754, + "learning_rate": 3.1753633174738116e-07, + "loss": 0.3927, + "step": 47540 + }, + { + "epoch": 1.9541457802613198, + "grad_norm": 1.1011602878570557, + "learning_rate": 3.070279996578163e-07, + "loss": 0.4089, + "step": 47560 + }, + { + "epoch": 1.9549675404716904, + "grad_norm": 1.1266615390777588, + "learning_rate": 2.966962161478182e-07, + "loss": 0.4117, + "step": 47580 + }, + { + "epoch": 1.955789300682061, + "grad_norm": 1.152133584022522, + "learning_rate": 2.8654099951417324e-07, + "loss": 0.4287, + "step": 47600 + }, + { + "epoch": 1.9566110608924316, + "grad_norm": 1.1630717515945435, + "learning_rate": 2.7656236774099564e-07, + "loss": 0.4353, + "step": 47620 + }, + { + "epoch": 1.9574328211028023, + "grad_norm": 1.036206603050232, + "learning_rate": 2.6676033849966087e-07, + "loss": 0.413, + "step": 47640 + }, + { + "epoch": 1.9582545813131729, + "grad_norm": 1.1259959936141968, + "learning_rate": 2.5713492914881677e-07, + "loss": 0.433, + "step": 47660 + }, + { + "epoch": 1.9590763415235435, + "grad_norm": 1.2203707695007324, + "learning_rate": 2.4768615673432806e-07, + "loss": 0.4102, + "step": 47680 + }, + { + "epoch": 1.959898101733914, + "grad_norm": 1.2147117853164673, + "learning_rate": 2.384140379892319e-07, + "loss": 0.4058, + "step": 47700 + }, + { + "epoch": 1.9607198619442845, + "grad_norm": 1.1942362785339355, + "learning_rate": 2.2931858933374907e-07, + "loss": 0.418, + "step": 47720 + }, + { + "epoch": 1.9615416221546553, + "grad_norm": 1.0480471849441528, + "learning_rate": 2.203998268752061e-07, + "loss": 0.4061, + "step": 47740 + }, + { + "epoch": 1.9623633823650257, + "grad_norm": 1.2805067300796509, + "learning_rate": 2.1165776640804658e-07, + "loss": 0.4289, + "step": 47760 + }, + { + "epoch": 1.9631851425753966, + "grad_norm": 1.2007514238357544, + "learning_rate": 2.0309242341379765e-07, + "loss": 0.4283, + "step": 47780 + }, + { + "epoch": 1.964006902785767, + "grad_norm": 1.1871360540390015, + "learning_rate": 1.947038130610146e-07, + "loss": 0.4089, + "step": 47800 + }, + { + "epoch": 1.9648286629961378, + "grad_norm": 1.1369190216064453, + "learning_rate": 1.8649195020528087e-07, + "loss": 0.4045, + "step": 47820 + }, + { + "epoch": 1.9656504232065082, + "grad_norm": 1.255011796951294, + "learning_rate": 1.7845684938916363e-07, + "loss": 0.4226, + "step": 47840 + }, + { + "epoch": 1.966472183416879, + "grad_norm": 1.2561933994293213, + "learning_rate": 1.7059852484223593e-07, + "loss": 0.4188, + "step": 47860 + }, + { + "epoch": 1.9672939436272494, + "grad_norm": 1.1774425506591797, + "learning_rate": 1.6291699048095464e-07, + "loss": 0.42, + "step": 47880 + }, + { + "epoch": 1.9681157038376202, + "grad_norm": 1.0992674827575684, + "learning_rate": 1.554122599087493e-07, + "loss": 0.4136, + "step": 47900 + }, + { + "epoch": 1.9689374640479906, + "grad_norm": 1.1646844148635864, + "learning_rate": 1.4808434641589985e-07, + "loss": 0.4238, + "step": 47920 + }, + { + "epoch": 1.9697592242583615, + "grad_norm": 1.1003245115280151, + "learning_rate": 1.409332629795923e-07, + "loss": 0.3955, + "step": 47940 + }, + { + "epoch": 1.9705809844687319, + "grad_norm": 1.0711121559143066, + "learning_rate": 1.3395902226384093e-07, + "loss": 0.4218, + "step": 47960 + }, + { + "epoch": 1.9714027446791027, + "grad_norm": 1.0575335025787354, + "learning_rate": 1.2716163661948833e-07, + "loss": 0.4218, + "step": 47980 + }, + { + "epoch": 1.972224504889473, + "grad_norm": 1.1879905462265015, + "learning_rate": 1.205411180841831e-07, + "loss": 0.4031, + "step": 48000 + }, + { + "epoch": 1.972224504889473, + "eval_loss": 0.9563899636268616, + "eval_runtime": 16.5443, + "eval_samples_per_second": 158.363, + "eval_steps_per_second": 4.956, + "step": 48000 + }, + { + "epoch": 1.973046265099844, + "grad_norm": 1.0687421560287476, + "learning_rate": 1.1409747838234675e-07, + "loss": 0.4245, + "step": 48020 + }, + { + "epoch": 1.9738680253102143, + "grad_norm": 1.2620290517807007, + "learning_rate": 1.0783072892518454e-07, + "loss": 0.4032, + "step": 48040 + }, + { + "epoch": 1.9746897855205852, + "grad_norm": 1.201851487159729, + "learning_rate": 1.0174088081060795e-07, + "loss": 0.416, + "step": 48060 + }, + { + "epoch": 1.9755115457309556, + "grad_norm": 1.2505393028259277, + "learning_rate": 9.611938979670054e-08, + "loss": 0.4171, + "step": 48080 + }, + { + "epoch": 1.9763333059413264, + "grad_norm": 1.2700996398925781, + "learning_rate": 9.037453003418739e-08, + "loss": 0.4151, + "step": 48100 + }, + { + "epoch": 1.9771550661516968, + "grad_norm": 1.1103131771087646, + "learning_rate": 8.480660252785378e-08, + "loss": 0.4188, + "step": 48120 + }, + { + "epoch": 1.9779768263620676, + "grad_norm": 1.0756477117538452, + "learning_rate": 7.941561713806777e-08, + "loss": 0.4078, + "step": 48140 + }, + { + "epoch": 1.978798586572438, + "grad_norm": 1.1716761589050293, + "learning_rate": 7.420158341185924e-08, + "loss": 0.4149, + "step": 48160 + }, + { + "epoch": 1.9796203467828088, + "grad_norm": 1.0660831928253174, + "learning_rate": 6.916451058286422e-08, + "loss": 0.4208, + "step": 48180 + }, + { + "epoch": 1.9804421069931792, + "grad_norm": 1.1756824254989624, + "learning_rate": 6.430440757136946e-08, + "loss": 0.4029, + "step": 48200 + }, + { + "epoch": 1.98126386720355, + "grad_norm": 1.0613312721252441, + "learning_rate": 5.962128298423463e-08, + "loss": 0.4096, + "step": 48220 + }, + { + "epoch": 1.9820856274139205, + "grad_norm": 1.1064634323120117, + "learning_rate": 5.533624844936691e-08, + "loss": 0.4193, + "step": 48240 + }, + { + "epoch": 1.9829073876242913, + "grad_norm": 1.0227088928222656, + "learning_rate": 5.099825535829217e-08, + "loss": 0.4216, + "step": 48260 + }, + { + "epoch": 1.9837291478346617, + "grad_norm": 1.2082315683364868, + "learning_rate": 4.683726425576129e-08, + "loss": 0.4266, + "step": 48280 + }, + { + "epoch": 1.9845509080450325, + "grad_norm": 1.2815755605697632, + "learning_rate": 4.2853282510546545e-08, + "loss": 0.3935, + "step": 48300 + }, + { + "epoch": 1.985372668255403, + "grad_norm": 1.1386293172836304, + "learning_rate": 3.904631717798202e-08, + "loss": 0.4065, + "step": 48320 + }, + { + "epoch": 1.9861944284657738, + "grad_norm": 1.1509379148483276, + "learning_rate": 3.5416374999919235e-08, + "loss": 0.4228, + "step": 48340 + }, + { + "epoch": 1.9870161886761442, + "grad_norm": 1.1019117832183838, + "learning_rate": 3.1963462404693835e-08, + "loss": 0.4147, + "step": 48360 + }, + { + "epoch": 1.987837948886515, + "grad_norm": 1.0444250106811523, + "learning_rate": 2.8687585507147784e-08, + "loss": 0.4276, + "step": 48380 + }, + { + "epoch": 1.9886597090968854, + "grad_norm": 1.1302378177642822, + "learning_rate": 2.5588750108618275e-08, + "loss": 0.4149, + "step": 48400 + }, + { + "epoch": 1.9894814693072562, + "grad_norm": 1.082972764968872, + "learning_rate": 2.2666961696893308e-08, + "loss": 0.4388, + "step": 48420 + }, + { + "epoch": 1.9903032295176266, + "grad_norm": 1.1898425817489624, + "learning_rate": 1.9922225446245e-08, + "loss": 0.3988, + "step": 48440 + }, + { + "epoch": 1.9911249897279975, + "grad_norm": 1.2624133825302124, + "learning_rate": 1.7354546217385192e-08, + "loss": 0.4062, + "step": 48460 + }, + { + "epoch": 1.9919467499383678, + "grad_norm": 1.1594356298446655, + "learning_rate": 1.4963928557465425e-08, + "loss": 0.4142, + "step": 48480 + }, + { + "epoch": 1.9927685101487387, + "grad_norm": 1.2006750106811523, + "learning_rate": 1.2750376700099153e-08, + "loss": 0.4203, + "step": 48500 + }, + { + "epoch": 1.993590270359109, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.0713894565317351e-08, + "loss": 0.421, + "step": 48520 + }, + { + "epoch": 1.99441203056948, + "grad_norm": 1.067572832107544, + "learning_rate": 8.854485759568487e-09, + "loss": 0.4103, + "step": 48540 + }, + { + "epoch": 1.9952337907798503, + "grad_norm": 1.3517385721206665, + "learning_rate": 7.172153575718543e-09, + "loss": 0.42, + "step": 48560 + }, + { + "epoch": 1.9960555509902211, + "grad_norm": 1.1812193393707275, + "learning_rate": 5.666900993062107e-09, + "loss": 0.4011, + "step": 48580 + }, + { + "epoch": 1.9968773112005915, + "grad_norm": 1.1128283739089966, + "learning_rate": 4.338730677266867e-09, + "loss": 0.4091, + "step": 48600 + }, + { + "epoch": 1.9976990714109624, + "grad_norm": 1.2038655281066895, + "learning_rate": 3.1876449804291163e-09, + "loss": 0.4236, + "step": 48620 + }, + { + "epoch": 1.9985208316213328, + "grad_norm": 1.2628865242004395, + "learning_rate": 2.213645941029352e-09, + "loss": 0.4098, + "step": 48640 + }, + { + "epoch": 1.9993425918317036, + "grad_norm": 0.9929442405700684, + "learning_rate": 1.4167352839544735e-09, + "loss": 0.4198, + "step": 48660 + } + ], + "logging_steps": 20, + "max_steps": 48676, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4754061349278974e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}