diff --git "a/checkpoint-45000/trainer_state.json" "b/checkpoint-45000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-45000/trainer_state.json" @@ -0,0 +1,31534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998222538215429, + "eval_steps": 500, + "global_step": 45000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00022218272307145397, + "grad_norm": 3.3125, + "learning_rate": 1.9996000000000003e-05, + "loss": 1.2837, + "step": 10 + }, + { + "epoch": 0.00044436544614290794, + "grad_norm": 3.84375, + "learning_rate": 1.9991555555555558e-05, + "loss": 0.6861, + "step": 20 + }, + { + "epoch": 0.0006665481692143619, + "grad_norm": 2.609375, + "learning_rate": 1.9987111111111113e-05, + "loss": 0.6204, + "step": 30 + }, + { + "epoch": 0.0008887308922858159, + "grad_norm": 2.359375, + "learning_rate": 1.9982666666666668e-05, + "loss": 0.5587, + "step": 40 + }, + { + "epoch": 0.0011109136153572697, + "grad_norm": 2.859375, + "learning_rate": 1.9978222222222222e-05, + "loss": 0.5842, + "step": 50 + }, + { + "epoch": 0.0013330963384287238, + "grad_norm": 2.484375, + "learning_rate": 1.9973777777777777e-05, + "loss": 0.6044, + "step": 60 + }, + { + "epoch": 0.0015552790615001777, + "grad_norm": 2.15625, + "learning_rate": 1.9969333333333335e-05, + "loss": 0.5807, + "step": 70 + }, + { + "epoch": 0.0017774617845716318, + "grad_norm": 2.625, + "learning_rate": 1.996488888888889e-05, + "loss": 0.5589, + "step": 80 + }, + { + "epoch": 0.0019996445076430856, + "grad_norm": 2.4375, + "learning_rate": 1.9960444444444445e-05, + "loss": 0.5105, + "step": 90 + }, + { + "epoch": 0.0022218272307145395, + "grad_norm": 1.8828125, + "learning_rate": 1.9956000000000003e-05, + "loss": 0.5402, + "step": 100 + }, + { + "epoch": 0.002444009953785994, + "grad_norm": 2.65625, + "learning_rate": 1.9951555555555558e-05, + "loss": 0.5461, + "step": 110 + }, + { + "epoch": 0.0026661926768574476, + "grad_norm": 2.15625, + "learning_rate": 1.9947111111111113e-05, + "loss": 0.5681, + "step": 120 + }, + { + "epoch": 0.0028883753999289015, + "grad_norm": 2.609375, + "learning_rate": 1.9942666666666668e-05, + "loss": 0.5401, + "step": 130 + }, + { + "epoch": 0.0031105581230003554, + "grad_norm": 2.5, + "learning_rate": 1.9938222222222223e-05, + "loss": 0.525, + "step": 140 + }, + { + "epoch": 0.0033327408460718097, + "grad_norm": 2.1875, + "learning_rate": 1.9933777777777778e-05, + "loss": 0.5138, + "step": 150 + }, + { + "epoch": 0.0035549235691432635, + "grad_norm": 2.328125, + "learning_rate": 1.9929333333333332e-05, + "loss": 0.5688, + "step": 160 + }, + { + "epoch": 0.0037771062922147174, + "grad_norm": 1.9765625, + "learning_rate": 1.992488888888889e-05, + "loss": 0.5616, + "step": 170 + }, + { + "epoch": 0.003999289015286171, + "grad_norm": 2.34375, + "learning_rate": 1.9920444444444446e-05, + "loss": 0.5798, + "step": 180 + }, + { + "epoch": 0.004221471738357625, + "grad_norm": 2.6875, + "learning_rate": 1.9916e-05, + "loss": 0.5198, + "step": 190 + }, + { + "epoch": 0.004443654461429079, + "grad_norm": 2.296875, + "learning_rate": 1.991155555555556e-05, + "loss": 0.5195, + "step": 200 + }, + { + "epoch": 0.004665837184500533, + "grad_norm": 2.140625, + "learning_rate": 1.9907111111111113e-05, + "loss": 0.5163, + "step": 210 + }, + { + "epoch": 0.004888019907571988, + "grad_norm": 2.171875, + "learning_rate": 1.990266666666667e-05, + "loss": 0.5526, + "step": 220 + }, + { + "epoch": 0.0051102026306434414, + "grad_norm": 2.375, + "learning_rate": 1.9898222222222223e-05, + "loss": 0.5381, + "step": 230 + }, + { + "epoch": 0.005332385353714895, + "grad_norm": 1.8046875, + "learning_rate": 1.9893777777777778e-05, + "loss": 0.4959, + "step": 240 + }, + { + "epoch": 0.005554568076786349, + "grad_norm": 2.40625, + "learning_rate": 1.9889333333333336e-05, + "loss": 0.5625, + "step": 250 + }, + { + "epoch": 0.005776750799857803, + "grad_norm": 2.109375, + "learning_rate": 1.988488888888889e-05, + "loss": 0.5574, + "step": 260 + }, + { + "epoch": 0.005998933522929257, + "grad_norm": 1.90625, + "learning_rate": 1.9880444444444446e-05, + "loss": 0.5417, + "step": 270 + }, + { + "epoch": 0.006221116246000711, + "grad_norm": 2.171875, + "learning_rate": 1.9876e-05, + "loss": 0.5873, + "step": 280 + }, + { + "epoch": 0.006443298969072165, + "grad_norm": 2.75, + "learning_rate": 1.9871555555555556e-05, + "loss": 0.5572, + "step": 290 + }, + { + "epoch": 0.006665481692143619, + "grad_norm": 1.7421875, + "learning_rate": 1.9867111111111114e-05, + "loss": 0.4952, + "step": 300 + }, + { + "epoch": 0.006887664415215073, + "grad_norm": 2.40625, + "learning_rate": 1.986266666666667e-05, + "loss": 0.5766, + "step": 310 + }, + { + "epoch": 0.007109847138286527, + "grad_norm": 2.03125, + "learning_rate": 1.9858222222222224e-05, + "loss": 0.5868, + "step": 320 + }, + { + "epoch": 0.007332029861357981, + "grad_norm": 2.671875, + "learning_rate": 1.985377777777778e-05, + "loss": 0.531, + "step": 330 + }, + { + "epoch": 0.007554212584429435, + "grad_norm": 2.109375, + "learning_rate": 1.9849333333333337e-05, + "loss": 0.5477, + "step": 340 + }, + { + "epoch": 0.007776395307500889, + "grad_norm": 2.703125, + "learning_rate": 1.984488888888889e-05, + "loss": 0.5688, + "step": 350 + }, + { + "epoch": 0.007998578030572343, + "grad_norm": 2.15625, + "learning_rate": 1.9840444444444446e-05, + "loss": 0.5066, + "step": 360 + }, + { + "epoch": 0.008220760753643797, + "grad_norm": 2.59375, + "learning_rate": 1.9836e-05, + "loss": 0.5124, + "step": 370 + }, + { + "epoch": 0.00844294347671525, + "grad_norm": 2.125, + "learning_rate": 1.9831555555555556e-05, + "loss": 0.5109, + "step": 380 + }, + { + "epoch": 0.008665126199786705, + "grad_norm": 2.59375, + "learning_rate": 1.982711111111111e-05, + "loss": 0.5672, + "step": 390 + }, + { + "epoch": 0.008887308922858158, + "grad_norm": 2.203125, + "learning_rate": 1.982266666666667e-05, + "loss": 0.5503, + "step": 400 + }, + { + "epoch": 0.009109491645929613, + "grad_norm": 1.9609375, + "learning_rate": 1.9818222222222224e-05, + "loss": 0.5321, + "step": 410 + }, + { + "epoch": 0.009331674369001066, + "grad_norm": 2.515625, + "learning_rate": 1.981377777777778e-05, + "loss": 0.4862, + "step": 420 + }, + { + "epoch": 0.00955385709207252, + "grad_norm": 2.4375, + "learning_rate": 1.9809333333333337e-05, + "loss": 0.4908, + "step": 430 + }, + { + "epoch": 0.009776039815143975, + "grad_norm": 2.46875, + "learning_rate": 1.9804888888888892e-05, + "loss": 0.5749, + "step": 440 + }, + { + "epoch": 0.009998222538215428, + "grad_norm": 2.78125, + "learning_rate": 1.9800444444444447e-05, + "loss": 0.532, + "step": 450 + }, + { + "epoch": 0.010220405261286883, + "grad_norm": 1.7421875, + "learning_rate": 1.9796e-05, + "loss": 0.5432, + "step": 460 + }, + { + "epoch": 0.010442587984358336, + "grad_norm": 2.25, + "learning_rate": 1.9791555555555556e-05, + "loss": 0.5124, + "step": 470 + }, + { + "epoch": 0.01066477070742979, + "grad_norm": 2.375, + "learning_rate": 1.978711111111111e-05, + "loss": 0.5398, + "step": 480 + }, + { + "epoch": 0.010886953430501244, + "grad_norm": 2.453125, + "learning_rate": 1.978266666666667e-05, + "loss": 0.5623, + "step": 490 + }, + { + "epoch": 0.011109136153572698, + "grad_norm": 2.28125, + "learning_rate": 1.9778222222222224e-05, + "loss": 0.4947, + "step": 500 + }, + { + "epoch": 0.011331318876644151, + "grad_norm": 1.9453125, + "learning_rate": 1.977377777777778e-05, + "loss": 0.5006, + "step": 510 + }, + { + "epoch": 0.011553501599715606, + "grad_norm": 2.3125, + "learning_rate": 1.9769333333333337e-05, + "loss": 0.5097, + "step": 520 + }, + { + "epoch": 0.01177568432278706, + "grad_norm": 2.0625, + "learning_rate": 1.9764888888888892e-05, + "loss": 0.4702, + "step": 530 + }, + { + "epoch": 0.011997867045858514, + "grad_norm": 1.9609375, + "learning_rate": 1.9760444444444447e-05, + "loss": 0.5285, + "step": 540 + }, + { + "epoch": 0.012220049768929968, + "grad_norm": 1.8125, + "learning_rate": 1.9756000000000002e-05, + "loss": 0.522, + "step": 550 + }, + { + "epoch": 0.012442232492001421, + "grad_norm": 2.046875, + "learning_rate": 1.9751555555555557e-05, + "loss": 0.4878, + "step": 560 + }, + { + "epoch": 0.012664415215072876, + "grad_norm": 2.21875, + "learning_rate": 1.974711111111111e-05, + "loss": 0.5499, + "step": 570 + }, + { + "epoch": 0.01288659793814433, + "grad_norm": 1.984375, + "learning_rate": 1.9742666666666666e-05, + "loss": 0.487, + "step": 580 + }, + { + "epoch": 0.013108780661215784, + "grad_norm": 2.15625, + "learning_rate": 1.9738222222222225e-05, + "loss": 0.5365, + "step": 590 + }, + { + "epoch": 0.013330963384287239, + "grad_norm": 2.1875, + "learning_rate": 1.973377777777778e-05, + "loss": 0.5492, + "step": 600 + }, + { + "epoch": 0.013553146107358692, + "grad_norm": 2.21875, + "learning_rate": 1.9729333333333334e-05, + "loss": 0.4887, + "step": 610 + }, + { + "epoch": 0.013775328830430146, + "grad_norm": 2.328125, + "learning_rate": 1.9724888888888893e-05, + "loss": 0.5365, + "step": 620 + }, + { + "epoch": 0.0139975115535016, + "grad_norm": 2.078125, + "learning_rate": 1.9720444444444447e-05, + "loss": 0.5356, + "step": 630 + }, + { + "epoch": 0.014219694276573054, + "grad_norm": 1.9296875, + "learning_rate": 1.9716000000000002e-05, + "loss": 0.5229, + "step": 640 + }, + { + "epoch": 0.014441876999644507, + "grad_norm": 2.03125, + "learning_rate": 1.9711555555555557e-05, + "loss": 0.4801, + "step": 650 + }, + { + "epoch": 0.014664059722715962, + "grad_norm": 2.046875, + "learning_rate": 1.9707111111111112e-05, + "loss": 0.5292, + "step": 660 + }, + { + "epoch": 0.014886242445787415, + "grad_norm": 2.21875, + "learning_rate": 1.9702666666666667e-05, + "loss": 0.4828, + "step": 670 + }, + { + "epoch": 0.01510842516885887, + "grad_norm": 2.0625, + "learning_rate": 1.9698222222222222e-05, + "loss": 0.5338, + "step": 680 + }, + { + "epoch": 0.015330607891930324, + "grad_norm": 2.140625, + "learning_rate": 1.969377777777778e-05, + "loss": 0.5348, + "step": 690 + }, + { + "epoch": 0.015552790615001777, + "grad_norm": 2.109375, + "learning_rate": 1.9689333333333335e-05, + "loss": 0.5324, + "step": 700 + }, + { + "epoch": 0.01577497333807323, + "grad_norm": 1.8359375, + "learning_rate": 1.968488888888889e-05, + "loss": 0.4862, + "step": 710 + }, + { + "epoch": 0.015997156061144685, + "grad_norm": 1.875, + "learning_rate": 1.9680444444444448e-05, + "loss": 0.4729, + "step": 720 + }, + { + "epoch": 0.01621933878421614, + "grad_norm": 2.203125, + "learning_rate": 1.9676000000000003e-05, + "loss": 0.5072, + "step": 730 + }, + { + "epoch": 0.016441521507287594, + "grad_norm": 1.75, + "learning_rate": 1.9671555555555558e-05, + "loss": 0.4851, + "step": 740 + }, + { + "epoch": 0.016663704230359046, + "grad_norm": 1.734375, + "learning_rate": 1.9667111111111112e-05, + "loss": 0.5163, + "step": 750 + }, + { + "epoch": 0.0168858869534305, + "grad_norm": 1.9609375, + "learning_rate": 1.9662666666666667e-05, + "loss": 0.5307, + "step": 760 + }, + { + "epoch": 0.017108069676501955, + "grad_norm": 2.0, + "learning_rate": 1.9658222222222222e-05, + "loss": 0.4971, + "step": 770 + }, + { + "epoch": 0.01733025239957341, + "grad_norm": 1.8671875, + "learning_rate": 1.9653777777777777e-05, + "loss": 0.5307, + "step": 780 + }, + { + "epoch": 0.017552435122644865, + "grad_norm": 2.65625, + "learning_rate": 1.9649333333333335e-05, + "loss": 0.5158, + "step": 790 + }, + { + "epoch": 0.017774617845716316, + "grad_norm": 2.359375, + "learning_rate": 1.964488888888889e-05, + "loss": 0.5216, + "step": 800 + }, + { + "epoch": 0.01799680056878777, + "grad_norm": 2.109375, + "learning_rate": 1.9640444444444445e-05, + "loss": 0.544, + "step": 810 + }, + { + "epoch": 0.018218983291859225, + "grad_norm": 1.8203125, + "learning_rate": 1.9636000000000003e-05, + "loss": 0.5232, + "step": 820 + }, + { + "epoch": 0.01844116601493068, + "grad_norm": 2.15625, + "learning_rate": 1.9631555555555558e-05, + "loss": 0.481, + "step": 830 + }, + { + "epoch": 0.01866334873800213, + "grad_norm": 2.171875, + "learning_rate": 1.9627111111111113e-05, + "loss": 0.4943, + "step": 840 + }, + { + "epoch": 0.018885531461073586, + "grad_norm": 1.9296875, + "learning_rate": 1.9622666666666668e-05, + "loss": 0.5153, + "step": 850 + }, + { + "epoch": 0.01910771418414504, + "grad_norm": 1.84375, + "learning_rate": 1.9618222222222222e-05, + "loss": 0.4575, + "step": 860 + }, + { + "epoch": 0.019329896907216496, + "grad_norm": 1.7890625, + "learning_rate": 1.961377777777778e-05, + "loss": 0.5078, + "step": 870 + }, + { + "epoch": 0.01955207963028795, + "grad_norm": 1.890625, + "learning_rate": 1.9609333333333336e-05, + "loss": 0.4857, + "step": 880 + }, + { + "epoch": 0.0197742623533594, + "grad_norm": 1.7890625, + "learning_rate": 1.960488888888889e-05, + "loss": 0.5011, + "step": 890 + }, + { + "epoch": 0.019996445076430856, + "grad_norm": 1.6796875, + "learning_rate": 1.9600444444444445e-05, + "loss": 0.5439, + "step": 900 + }, + { + "epoch": 0.02021862779950231, + "grad_norm": 2.0625, + "learning_rate": 1.9596e-05, + "loss": 0.4918, + "step": 910 + }, + { + "epoch": 0.020440810522573766, + "grad_norm": 1.859375, + "learning_rate": 1.959155555555556e-05, + "loss": 0.4805, + "step": 920 + }, + { + "epoch": 0.020662993245645217, + "grad_norm": 2.046875, + "learning_rate": 1.9587111111111113e-05, + "loss": 0.4735, + "step": 930 + }, + { + "epoch": 0.02088517596871667, + "grad_norm": 1.546875, + "learning_rate": 1.9582666666666668e-05, + "loss": 0.4953, + "step": 940 + }, + { + "epoch": 0.021107358691788126, + "grad_norm": 1.7734375, + "learning_rate": 1.9578222222222223e-05, + "loss": 0.4638, + "step": 950 + }, + { + "epoch": 0.02132954141485958, + "grad_norm": 2.3125, + "learning_rate": 1.957377777777778e-05, + "loss": 0.5258, + "step": 960 + }, + { + "epoch": 0.021551724137931036, + "grad_norm": 2.3125, + "learning_rate": 1.9569333333333336e-05, + "loss": 0.4854, + "step": 970 + }, + { + "epoch": 0.021773906861002487, + "grad_norm": 2.421875, + "learning_rate": 1.956488888888889e-05, + "loss": 0.5109, + "step": 980 + }, + { + "epoch": 0.021996089584073942, + "grad_norm": 1.921875, + "learning_rate": 1.9560444444444446e-05, + "loss": 0.5116, + "step": 990 + }, + { + "epoch": 0.022218272307145397, + "grad_norm": 2.34375, + "learning_rate": 1.9556e-05, + "loss": 0.5149, + "step": 1000 + }, + { + "epoch": 0.02244045503021685, + "grad_norm": 1.703125, + "learning_rate": 1.9551555555555555e-05, + "loss": 0.5, + "step": 1010 + }, + { + "epoch": 0.022662637753288303, + "grad_norm": 2.1875, + "learning_rate": 1.9547111111111114e-05, + "loss": 0.4927, + "step": 1020 + }, + { + "epoch": 0.022884820476359757, + "grad_norm": 2.0625, + "learning_rate": 1.954266666666667e-05, + "loss": 0.5126, + "step": 1030 + }, + { + "epoch": 0.023107003199431212, + "grad_norm": 2.046875, + "learning_rate": 1.9538222222222223e-05, + "loss": 0.5177, + "step": 1040 + }, + { + "epoch": 0.023329185922502667, + "grad_norm": 2.015625, + "learning_rate": 1.953377777777778e-05, + "loss": 0.4775, + "step": 1050 + }, + { + "epoch": 0.02355136864557412, + "grad_norm": 2.125, + "learning_rate": 1.9529333333333336e-05, + "loss": 0.5069, + "step": 1060 + }, + { + "epoch": 0.023773551368645573, + "grad_norm": 1.96875, + "learning_rate": 1.952488888888889e-05, + "loss": 0.4852, + "step": 1070 + }, + { + "epoch": 0.023995734091717028, + "grad_norm": 2.234375, + "learning_rate": 1.9520444444444446e-05, + "loss": 0.4755, + "step": 1080 + }, + { + "epoch": 0.024217916814788482, + "grad_norm": 2.125, + "learning_rate": 1.9516e-05, + "loss": 0.4473, + "step": 1090 + }, + { + "epoch": 0.024440099537859937, + "grad_norm": 2.203125, + "learning_rate": 1.9511555555555556e-05, + "loss": 0.5012, + "step": 1100 + }, + { + "epoch": 0.024662282260931388, + "grad_norm": 2.171875, + "learning_rate": 1.950711111111111e-05, + "loss": 0.4845, + "step": 1110 + }, + { + "epoch": 0.024884464984002843, + "grad_norm": 1.640625, + "learning_rate": 1.950266666666667e-05, + "loss": 0.4897, + "step": 1120 + }, + { + "epoch": 0.025106647707074298, + "grad_norm": 1.7578125, + "learning_rate": 1.9498222222222224e-05, + "loss": 0.4571, + "step": 1130 + }, + { + "epoch": 0.025328830430145752, + "grad_norm": 2.1875, + "learning_rate": 1.9493777777777782e-05, + "loss": 0.4878, + "step": 1140 + }, + { + "epoch": 0.025551013153217207, + "grad_norm": 2.078125, + "learning_rate": 1.9489333333333337e-05, + "loss": 0.5114, + "step": 1150 + }, + { + "epoch": 0.02577319587628866, + "grad_norm": 2.203125, + "learning_rate": 1.948488888888889e-05, + "loss": 0.5107, + "step": 1160 + }, + { + "epoch": 0.025995378599360113, + "grad_norm": 2.078125, + "learning_rate": 1.9480444444444446e-05, + "loss": 0.5008, + "step": 1170 + }, + { + "epoch": 0.026217561322431568, + "grad_norm": 2.328125, + "learning_rate": 1.9476e-05, + "loss": 0.5021, + "step": 1180 + }, + { + "epoch": 0.026439744045503023, + "grad_norm": 2.0625, + "learning_rate": 1.9471555555555556e-05, + "loss": 0.538, + "step": 1190 + }, + { + "epoch": 0.026661926768574477, + "grad_norm": 1.625, + "learning_rate": 1.946711111111111e-05, + "loss": 0.4946, + "step": 1200 + }, + { + "epoch": 0.02688410949164593, + "grad_norm": 2.296875, + "learning_rate": 1.946266666666667e-05, + "loss": 0.4855, + "step": 1210 + }, + { + "epoch": 0.027106292214717383, + "grad_norm": 2.171875, + "learning_rate": 1.9458222222222224e-05, + "loss": 0.5346, + "step": 1220 + }, + { + "epoch": 0.027328474937788838, + "grad_norm": 1.7109375, + "learning_rate": 1.945377777777778e-05, + "loss": 0.5336, + "step": 1230 + }, + { + "epoch": 0.027550657660860293, + "grad_norm": 2.03125, + "learning_rate": 1.9449333333333337e-05, + "loss": 0.5163, + "step": 1240 + }, + { + "epoch": 0.027772840383931744, + "grad_norm": 1.875, + "learning_rate": 1.9444888888888892e-05, + "loss": 0.483, + "step": 1250 + }, + { + "epoch": 0.0279950231070032, + "grad_norm": 2.203125, + "learning_rate": 1.9440444444444447e-05, + "loss": 0.5556, + "step": 1260 + }, + { + "epoch": 0.028217205830074654, + "grad_norm": 2.46875, + "learning_rate": 1.9436e-05, + "loss": 0.5281, + "step": 1270 + }, + { + "epoch": 0.028439388553146108, + "grad_norm": 2.265625, + "learning_rate": 1.9431555555555556e-05, + "loss": 0.4806, + "step": 1280 + }, + { + "epoch": 0.028661571276217563, + "grad_norm": 1.7265625, + "learning_rate": 1.942711111111111e-05, + "loss": 0.5233, + "step": 1290 + }, + { + "epoch": 0.028883753999289014, + "grad_norm": 1.8828125, + "learning_rate": 1.9422666666666666e-05, + "loss": 0.5057, + "step": 1300 + }, + { + "epoch": 0.02910593672236047, + "grad_norm": 1.7890625, + "learning_rate": 1.9418222222222224e-05, + "loss": 0.4829, + "step": 1310 + }, + { + "epoch": 0.029328119445431924, + "grad_norm": 2.046875, + "learning_rate": 1.941377777777778e-05, + "loss": 0.4969, + "step": 1320 + }, + { + "epoch": 0.02955030216850338, + "grad_norm": 2.09375, + "learning_rate": 1.9409333333333334e-05, + "loss": 0.5034, + "step": 1330 + }, + { + "epoch": 0.02977248489157483, + "grad_norm": 1.8515625, + "learning_rate": 1.9404888888888892e-05, + "loss": 0.5165, + "step": 1340 + }, + { + "epoch": 0.029994667614646284, + "grad_norm": 1.953125, + "learning_rate": 1.9400444444444447e-05, + "loss": 0.4964, + "step": 1350 + }, + { + "epoch": 0.03021685033771774, + "grad_norm": 1.8125, + "learning_rate": 1.9396000000000002e-05, + "loss": 0.4331, + "step": 1360 + }, + { + "epoch": 0.030439033060789194, + "grad_norm": 1.8203125, + "learning_rate": 1.9391555555555557e-05, + "loss": 0.4651, + "step": 1370 + }, + { + "epoch": 0.03066121578386065, + "grad_norm": 2.015625, + "learning_rate": 1.938711111111111e-05, + "loss": 0.48, + "step": 1380 + }, + { + "epoch": 0.0308833985069321, + "grad_norm": 1.9609375, + "learning_rate": 1.9382666666666667e-05, + "loss": 0.4874, + "step": 1390 + }, + { + "epoch": 0.031105581230003555, + "grad_norm": 1.734375, + "learning_rate": 1.937822222222222e-05, + "loss": 0.4431, + "step": 1400 + }, + { + "epoch": 0.03132776395307501, + "grad_norm": 1.96875, + "learning_rate": 1.937377777777778e-05, + "loss": 0.5207, + "step": 1410 + }, + { + "epoch": 0.03154994667614646, + "grad_norm": 1.984375, + "learning_rate": 1.9369333333333334e-05, + "loss": 0.5071, + "step": 1420 + }, + { + "epoch": 0.03177212939921792, + "grad_norm": 2.046875, + "learning_rate": 1.936488888888889e-05, + "loss": 0.5093, + "step": 1430 + }, + { + "epoch": 0.03199431212228937, + "grad_norm": 1.59375, + "learning_rate": 1.9360444444444448e-05, + "loss": 0.4823, + "step": 1440 + }, + { + "epoch": 0.03221649484536082, + "grad_norm": 1.5546875, + "learning_rate": 1.9356000000000002e-05, + "loss": 0.4658, + "step": 1450 + }, + { + "epoch": 0.03243867756843228, + "grad_norm": 1.78125, + "learning_rate": 1.9351555555555557e-05, + "loss": 0.5124, + "step": 1460 + }, + { + "epoch": 0.03266086029150373, + "grad_norm": 2.046875, + "learning_rate": 1.9347111111111112e-05, + "loss": 0.513, + "step": 1470 + }, + { + "epoch": 0.03288304301457519, + "grad_norm": 1.9921875, + "learning_rate": 1.9342666666666667e-05, + "loss": 0.4456, + "step": 1480 + }, + { + "epoch": 0.03310522573764664, + "grad_norm": 1.8984375, + "learning_rate": 1.9338222222222225e-05, + "loss": 0.5171, + "step": 1490 + }, + { + "epoch": 0.03332740846071809, + "grad_norm": 1.875, + "learning_rate": 1.933377777777778e-05, + "loss": 0.4782, + "step": 1500 + }, + { + "epoch": 0.03354959118378955, + "grad_norm": 1.7265625, + "learning_rate": 1.9329333333333335e-05, + "loss": 0.4689, + "step": 1510 + }, + { + "epoch": 0.033771773906861, + "grad_norm": 1.96875, + "learning_rate": 1.932488888888889e-05, + "loss": 0.506, + "step": 1520 + }, + { + "epoch": 0.03399395662993246, + "grad_norm": 2.421875, + "learning_rate": 1.9320444444444445e-05, + "loss": 0.5159, + "step": 1530 + }, + { + "epoch": 0.03421613935300391, + "grad_norm": 2.0625, + "learning_rate": 1.9316000000000003e-05, + "loss": 0.504, + "step": 1540 + }, + { + "epoch": 0.03443832207607536, + "grad_norm": 1.5546875, + "learning_rate": 1.9311555555555558e-05, + "loss": 0.5043, + "step": 1550 + }, + { + "epoch": 0.03466050479914682, + "grad_norm": 1.84375, + "learning_rate": 1.9307111111111112e-05, + "loss": 0.4947, + "step": 1560 + }, + { + "epoch": 0.03488268752221827, + "grad_norm": 2.3125, + "learning_rate": 1.9302666666666667e-05, + "loss": 0.4833, + "step": 1570 + }, + { + "epoch": 0.03510487024528973, + "grad_norm": 1.8046875, + "learning_rate": 1.9298222222222226e-05, + "loss": 0.4666, + "step": 1580 + }, + { + "epoch": 0.03532705296836118, + "grad_norm": 2.078125, + "learning_rate": 1.929377777777778e-05, + "loss": 0.4763, + "step": 1590 + }, + { + "epoch": 0.03554923569143263, + "grad_norm": 2.234375, + "learning_rate": 1.9289333333333335e-05, + "loss": 0.519, + "step": 1600 + }, + { + "epoch": 0.03577141841450409, + "grad_norm": 1.8046875, + "learning_rate": 1.928488888888889e-05, + "loss": 0.5078, + "step": 1610 + }, + { + "epoch": 0.03599360113757554, + "grad_norm": 1.9921875, + "learning_rate": 1.9280444444444445e-05, + "loss": 0.5186, + "step": 1620 + }, + { + "epoch": 0.036215783860647, + "grad_norm": 1.6875, + "learning_rate": 1.9276e-05, + "loss": 0.4755, + "step": 1630 + }, + { + "epoch": 0.03643796658371845, + "grad_norm": 1.8828125, + "learning_rate": 1.9271555555555558e-05, + "loss": 0.4768, + "step": 1640 + }, + { + "epoch": 0.0366601493067899, + "grad_norm": 1.859375, + "learning_rate": 1.9267111111111113e-05, + "loss": 0.5004, + "step": 1650 + }, + { + "epoch": 0.03688233202986136, + "grad_norm": 2.40625, + "learning_rate": 1.9262666666666668e-05, + "loss": 0.5269, + "step": 1660 + }, + { + "epoch": 0.03710451475293281, + "grad_norm": 1.7890625, + "learning_rate": 1.9258222222222226e-05, + "loss": 0.4504, + "step": 1670 + }, + { + "epoch": 0.03732669747600426, + "grad_norm": 2.265625, + "learning_rate": 1.925377777777778e-05, + "loss": 0.5188, + "step": 1680 + }, + { + "epoch": 0.03754888019907572, + "grad_norm": 2.140625, + "learning_rate": 1.9249333333333336e-05, + "loss": 0.4665, + "step": 1690 + }, + { + "epoch": 0.03777106292214717, + "grad_norm": 1.890625, + "learning_rate": 1.924488888888889e-05, + "loss": 0.5029, + "step": 1700 + }, + { + "epoch": 0.03799324564521863, + "grad_norm": 1.9609375, + "learning_rate": 1.9240444444444445e-05, + "loss": 0.5019, + "step": 1710 + }, + { + "epoch": 0.03821542836829008, + "grad_norm": 2.0625, + "learning_rate": 1.9236e-05, + "loss": 0.4716, + "step": 1720 + }, + { + "epoch": 0.03843761109136153, + "grad_norm": 1.8515625, + "learning_rate": 1.9231555555555555e-05, + "loss": 0.4626, + "step": 1730 + }, + { + "epoch": 0.03865979381443299, + "grad_norm": 1.8828125, + "learning_rate": 1.9227111111111113e-05, + "loss": 0.4727, + "step": 1740 + }, + { + "epoch": 0.03888197653750444, + "grad_norm": 2.0, + "learning_rate": 1.9222666666666668e-05, + "loss": 0.5317, + "step": 1750 + }, + { + "epoch": 0.0391041592605759, + "grad_norm": 1.734375, + "learning_rate": 1.9218222222222226e-05, + "loss": 0.4401, + "step": 1760 + }, + { + "epoch": 0.03932634198364735, + "grad_norm": 1.6875, + "learning_rate": 1.921377777777778e-05, + "loss": 0.4829, + "step": 1770 + }, + { + "epoch": 0.0395485247067188, + "grad_norm": 2.109375, + "learning_rate": 1.9209333333333336e-05, + "loss": 0.4393, + "step": 1780 + }, + { + "epoch": 0.03977070742979026, + "grad_norm": 1.84375, + "learning_rate": 1.920488888888889e-05, + "loss": 0.4402, + "step": 1790 + }, + { + "epoch": 0.03999289015286171, + "grad_norm": 2.25, + "learning_rate": 1.9200444444444446e-05, + "loss": 0.491, + "step": 1800 + }, + { + "epoch": 0.04021507287593317, + "grad_norm": 2.203125, + "learning_rate": 1.9196e-05, + "loss": 0.487, + "step": 1810 + }, + { + "epoch": 0.04043725559900462, + "grad_norm": 1.6171875, + "learning_rate": 1.9191555555555555e-05, + "loss": 0.5038, + "step": 1820 + }, + { + "epoch": 0.04065943832207607, + "grad_norm": 2.125, + "learning_rate": 1.9187111111111114e-05, + "loss": 0.4536, + "step": 1830 + }, + { + "epoch": 0.04088162104514753, + "grad_norm": 1.6875, + "learning_rate": 1.918266666666667e-05, + "loss": 0.5185, + "step": 1840 + }, + { + "epoch": 0.04110380376821898, + "grad_norm": 1.8828125, + "learning_rate": 1.9178222222222223e-05, + "loss": 0.4818, + "step": 1850 + }, + { + "epoch": 0.041325986491290434, + "grad_norm": 1.9921875, + "learning_rate": 1.917377777777778e-05, + "loss": 0.4608, + "step": 1860 + }, + { + "epoch": 0.04154816921436189, + "grad_norm": 1.7734375, + "learning_rate": 1.9169333333333336e-05, + "loss": 0.4942, + "step": 1870 + }, + { + "epoch": 0.04177035193743334, + "grad_norm": 1.734375, + "learning_rate": 1.916488888888889e-05, + "loss": 0.4829, + "step": 1880 + }, + { + "epoch": 0.0419925346605048, + "grad_norm": 2.015625, + "learning_rate": 1.9160444444444446e-05, + "loss": 0.4504, + "step": 1890 + }, + { + "epoch": 0.04221471738357625, + "grad_norm": 1.9765625, + "learning_rate": 1.9156e-05, + "loss": 0.491, + "step": 1900 + }, + { + "epoch": 0.042436900106647704, + "grad_norm": 2.34375, + "learning_rate": 1.9151555555555556e-05, + "loss": 0.486, + "step": 1910 + }, + { + "epoch": 0.04265908282971916, + "grad_norm": 2.4375, + "learning_rate": 1.914711111111111e-05, + "loss": 0.4814, + "step": 1920 + }, + { + "epoch": 0.042881265552790614, + "grad_norm": 1.9453125, + "learning_rate": 1.914266666666667e-05, + "loss": 0.4853, + "step": 1930 + }, + { + "epoch": 0.04310344827586207, + "grad_norm": 1.84375, + "learning_rate": 1.9138222222222224e-05, + "loss": 0.5006, + "step": 1940 + }, + { + "epoch": 0.04332563099893352, + "grad_norm": 1.8046875, + "learning_rate": 1.913377777777778e-05, + "loss": 0.5142, + "step": 1950 + }, + { + "epoch": 0.043547813722004974, + "grad_norm": 1.8984375, + "learning_rate": 1.9129333333333337e-05, + "loss": 0.4764, + "step": 1960 + }, + { + "epoch": 0.04376999644507643, + "grad_norm": 1.703125, + "learning_rate": 1.912488888888889e-05, + "loss": 0.4343, + "step": 1970 + }, + { + "epoch": 0.043992179168147884, + "grad_norm": 2.34375, + "learning_rate": 1.9120444444444446e-05, + "loss": 0.5007, + "step": 1980 + }, + { + "epoch": 0.04421436189121934, + "grad_norm": 2.3125, + "learning_rate": 1.9116e-05, + "loss": 0.5271, + "step": 1990 + }, + { + "epoch": 0.04443654461429079, + "grad_norm": 1.9375, + "learning_rate": 1.9111555555555556e-05, + "loss": 0.4994, + "step": 2000 + }, + { + "epoch": 0.044658727337362245, + "grad_norm": 1.828125, + "learning_rate": 1.910711111111111e-05, + "loss": 0.4418, + "step": 2010 + }, + { + "epoch": 0.0448809100604337, + "grad_norm": 1.7578125, + "learning_rate": 1.9102666666666666e-05, + "loss": 0.4824, + "step": 2020 + }, + { + "epoch": 0.045103092783505154, + "grad_norm": 1.9921875, + "learning_rate": 1.9098222222222224e-05, + "loss": 0.4577, + "step": 2030 + }, + { + "epoch": 0.045325275506576605, + "grad_norm": 2.0625, + "learning_rate": 1.909377777777778e-05, + "loss": 0.4742, + "step": 2040 + }, + { + "epoch": 0.04554745822964806, + "grad_norm": 1.9921875, + "learning_rate": 1.9089333333333334e-05, + "loss": 0.4691, + "step": 2050 + }, + { + "epoch": 0.045769640952719515, + "grad_norm": 2.109375, + "learning_rate": 1.9084888888888892e-05, + "loss": 0.4703, + "step": 2060 + }, + { + "epoch": 0.04599182367579097, + "grad_norm": 1.8125, + "learning_rate": 1.9080444444444447e-05, + "loss": 0.4721, + "step": 2070 + }, + { + "epoch": 0.046214006398862424, + "grad_norm": 1.8515625, + "learning_rate": 1.9076e-05, + "loss": 0.4702, + "step": 2080 + }, + { + "epoch": 0.046436189121933875, + "grad_norm": 1.6953125, + "learning_rate": 1.9071555555555557e-05, + "loss": 0.4956, + "step": 2090 + }, + { + "epoch": 0.046658371845005334, + "grad_norm": 2.109375, + "learning_rate": 1.906711111111111e-05, + "loss": 0.4889, + "step": 2100 + }, + { + "epoch": 0.046880554568076785, + "grad_norm": 2.046875, + "learning_rate": 1.906266666666667e-05, + "loss": 0.4642, + "step": 2110 + }, + { + "epoch": 0.04710273729114824, + "grad_norm": 1.78125, + "learning_rate": 1.9058222222222224e-05, + "loss": 0.4673, + "step": 2120 + }, + { + "epoch": 0.047324920014219694, + "grad_norm": 1.8125, + "learning_rate": 1.905377777777778e-05, + "loss": 0.478, + "step": 2130 + }, + { + "epoch": 0.047547102737291146, + "grad_norm": 2.109375, + "learning_rate": 1.9049333333333334e-05, + "loss": 0.4863, + "step": 2140 + }, + { + "epoch": 0.047769285460362604, + "grad_norm": 1.9453125, + "learning_rate": 1.904488888888889e-05, + "loss": 0.4782, + "step": 2150 + }, + { + "epoch": 0.047991468183434055, + "grad_norm": 1.6328125, + "learning_rate": 1.9040444444444447e-05, + "loss": 0.4336, + "step": 2160 + }, + { + "epoch": 0.04821365090650551, + "grad_norm": 1.9140625, + "learning_rate": 1.9036000000000002e-05, + "loss": 0.4448, + "step": 2170 + }, + { + "epoch": 0.048435833629576965, + "grad_norm": 1.6796875, + "learning_rate": 1.9031555555555557e-05, + "loss": 0.4361, + "step": 2180 + }, + { + "epoch": 0.048658016352648416, + "grad_norm": 2.078125, + "learning_rate": 1.9027111111111112e-05, + "loss": 0.4494, + "step": 2190 + }, + { + "epoch": 0.048880199075719874, + "grad_norm": 1.75, + "learning_rate": 1.902266666666667e-05, + "loss": 0.4557, + "step": 2200 + }, + { + "epoch": 0.049102381798791325, + "grad_norm": 1.75, + "learning_rate": 1.9018222222222225e-05, + "loss": 0.4584, + "step": 2210 + }, + { + "epoch": 0.049324564521862777, + "grad_norm": 2.265625, + "learning_rate": 1.901377777777778e-05, + "loss": 0.5059, + "step": 2220 + }, + { + "epoch": 0.049546747244934235, + "grad_norm": 2.359375, + "learning_rate": 1.9009333333333335e-05, + "loss": 0.4931, + "step": 2230 + }, + { + "epoch": 0.049768929968005686, + "grad_norm": 1.7734375, + "learning_rate": 1.900488888888889e-05, + "loss": 0.4557, + "step": 2240 + }, + { + "epoch": 0.049991112691077144, + "grad_norm": 1.9453125, + "learning_rate": 1.9000444444444444e-05, + "loss": 0.4672, + "step": 2250 + }, + { + "epoch": 0.050213295414148595, + "grad_norm": 1.71875, + "learning_rate": 1.8996000000000002e-05, + "loss": 0.4516, + "step": 2260 + }, + { + "epoch": 0.05043547813722005, + "grad_norm": 1.9453125, + "learning_rate": 1.8991555555555557e-05, + "loss": 0.4555, + "step": 2270 + }, + { + "epoch": 0.050657660860291505, + "grad_norm": 2.1875, + "learning_rate": 1.8987111111111112e-05, + "loss": 0.453, + "step": 2280 + }, + { + "epoch": 0.050879843583362956, + "grad_norm": 1.90625, + "learning_rate": 1.898266666666667e-05, + "loss": 0.4844, + "step": 2290 + }, + { + "epoch": 0.051102026306434414, + "grad_norm": 2.296875, + "learning_rate": 1.8978222222222225e-05, + "loss": 0.4848, + "step": 2300 + }, + { + "epoch": 0.051324209029505866, + "grad_norm": 1.9765625, + "learning_rate": 1.897377777777778e-05, + "loss": 0.4426, + "step": 2310 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 1.8984375, + "learning_rate": 1.8969333333333335e-05, + "loss": 0.4427, + "step": 2320 + }, + { + "epoch": 0.051768574475648775, + "grad_norm": 1.96875, + "learning_rate": 1.896488888888889e-05, + "loss": 0.4774, + "step": 2330 + }, + { + "epoch": 0.051990757198720226, + "grad_norm": 1.875, + "learning_rate": 1.8960444444444445e-05, + "loss": 0.4683, + "step": 2340 + }, + { + "epoch": 0.052212939921791685, + "grad_norm": 2.0625, + "learning_rate": 1.8956e-05, + "loss": 0.4915, + "step": 2350 + }, + { + "epoch": 0.052435122644863136, + "grad_norm": 2.078125, + "learning_rate": 1.8951555555555558e-05, + "loss": 0.4971, + "step": 2360 + }, + { + "epoch": 0.05265730536793459, + "grad_norm": 1.7890625, + "learning_rate": 1.8947111111111113e-05, + "loss": 0.4544, + "step": 2370 + }, + { + "epoch": 0.052879488091006045, + "grad_norm": 2.109375, + "learning_rate": 1.894266666666667e-05, + "loss": 0.4575, + "step": 2380 + }, + { + "epoch": 0.053101670814077497, + "grad_norm": 1.78125, + "learning_rate": 1.8938222222222226e-05, + "loss": 0.5068, + "step": 2390 + }, + { + "epoch": 0.053323853537148955, + "grad_norm": 1.671875, + "learning_rate": 1.893377777777778e-05, + "loss": 0.4863, + "step": 2400 + }, + { + "epoch": 0.053546036260220406, + "grad_norm": 1.828125, + "learning_rate": 1.8929333333333335e-05, + "loss": 0.4527, + "step": 2410 + }, + { + "epoch": 0.05376821898329186, + "grad_norm": 1.7421875, + "learning_rate": 1.892488888888889e-05, + "loss": 0.5141, + "step": 2420 + }, + { + "epoch": 0.053990401706363315, + "grad_norm": 2.0625, + "learning_rate": 1.8920444444444445e-05, + "loss": 0.478, + "step": 2430 + }, + { + "epoch": 0.05421258442943477, + "grad_norm": 2.125, + "learning_rate": 1.8916e-05, + "loss": 0.4193, + "step": 2440 + }, + { + "epoch": 0.05443476715250622, + "grad_norm": 2.125, + "learning_rate": 1.8911555555555555e-05, + "loss": 0.4617, + "step": 2450 + }, + { + "epoch": 0.054656949875577676, + "grad_norm": 1.890625, + "learning_rate": 1.8907111111111113e-05, + "loss": 0.4434, + "step": 2460 + }, + { + "epoch": 0.05487913259864913, + "grad_norm": 1.8125, + "learning_rate": 1.8902666666666668e-05, + "loss": 0.456, + "step": 2470 + }, + { + "epoch": 0.055101315321720586, + "grad_norm": 1.640625, + "learning_rate": 1.8898222222222226e-05, + "loss": 0.4581, + "step": 2480 + }, + { + "epoch": 0.05532349804479204, + "grad_norm": 2.5625, + "learning_rate": 1.889377777777778e-05, + "loss": 0.5004, + "step": 2490 + }, + { + "epoch": 0.05554568076786349, + "grad_norm": 1.6875, + "learning_rate": 1.8889333333333336e-05, + "loss": 0.4618, + "step": 2500 + }, + { + "epoch": 0.055767863490934946, + "grad_norm": 2.09375, + "learning_rate": 1.888488888888889e-05, + "loss": 0.4335, + "step": 2510 + }, + { + "epoch": 0.0559900462140064, + "grad_norm": 2.25, + "learning_rate": 1.8880444444444445e-05, + "loss": 0.4709, + "step": 2520 + }, + { + "epoch": 0.056212228937077856, + "grad_norm": 1.4609375, + "learning_rate": 1.8876e-05, + "loss": 0.4395, + "step": 2530 + }, + { + "epoch": 0.05643441166014931, + "grad_norm": 2.015625, + "learning_rate": 1.8871555555555555e-05, + "loss": 0.4741, + "step": 2540 + }, + { + "epoch": 0.05665659438322076, + "grad_norm": 1.5703125, + "learning_rate": 1.8867111111111113e-05, + "loss": 0.4438, + "step": 2550 + }, + { + "epoch": 0.056878777106292217, + "grad_norm": 1.7265625, + "learning_rate": 1.8862666666666668e-05, + "loss": 0.444, + "step": 2560 + }, + { + "epoch": 0.05710095982936367, + "grad_norm": 1.8125, + "learning_rate": 1.8858222222222223e-05, + "loss": 0.4554, + "step": 2570 + }, + { + "epoch": 0.057323142552435126, + "grad_norm": 1.8984375, + "learning_rate": 1.885377777777778e-05, + "loss": 0.507, + "step": 2580 + }, + { + "epoch": 0.05754532527550658, + "grad_norm": 2.078125, + "learning_rate": 1.8849333333333336e-05, + "loss": 0.4842, + "step": 2590 + }, + { + "epoch": 0.05776750799857803, + "grad_norm": 1.9765625, + "learning_rate": 1.884488888888889e-05, + "loss": 0.4443, + "step": 2600 + }, + { + "epoch": 0.05798969072164949, + "grad_norm": 1.8984375, + "learning_rate": 1.8840444444444446e-05, + "loss": 0.4685, + "step": 2610 + }, + { + "epoch": 0.05821187344472094, + "grad_norm": 1.9140625, + "learning_rate": 1.8836e-05, + "loss": 0.4668, + "step": 2620 + }, + { + "epoch": 0.05843405616779239, + "grad_norm": 2.15625, + "learning_rate": 1.8831555555555555e-05, + "loss": 0.4316, + "step": 2630 + }, + { + "epoch": 0.05865623889086385, + "grad_norm": 1.9609375, + "learning_rate": 1.882711111111111e-05, + "loss": 0.4482, + "step": 2640 + }, + { + "epoch": 0.0588784216139353, + "grad_norm": 1.8046875, + "learning_rate": 1.882266666666667e-05, + "loss": 0.4789, + "step": 2650 + }, + { + "epoch": 0.05910060433700676, + "grad_norm": 1.484375, + "learning_rate": 1.8818222222222223e-05, + "loss": 0.4504, + "step": 2660 + }, + { + "epoch": 0.05932278706007821, + "grad_norm": 2.09375, + "learning_rate": 1.8813777777777778e-05, + "loss": 0.486, + "step": 2670 + }, + { + "epoch": 0.05954496978314966, + "grad_norm": 1.7578125, + "learning_rate": 1.8809333333333336e-05, + "loss": 0.4454, + "step": 2680 + }, + { + "epoch": 0.05976715250622112, + "grad_norm": 2.03125, + "learning_rate": 1.880488888888889e-05, + "loss": 0.4169, + "step": 2690 + }, + { + "epoch": 0.05998933522929257, + "grad_norm": 1.875, + "learning_rate": 1.8800444444444446e-05, + "loss": 0.4754, + "step": 2700 + }, + { + "epoch": 0.06021151795236403, + "grad_norm": 2.25, + "learning_rate": 1.8796e-05, + "loss": 0.5018, + "step": 2710 + }, + { + "epoch": 0.06043370067543548, + "grad_norm": 1.8515625, + "learning_rate": 1.8791555555555556e-05, + "loss": 0.4931, + "step": 2720 + }, + { + "epoch": 0.06065588339850693, + "grad_norm": 1.9140625, + "learning_rate": 1.8787111111111114e-05, + "loss": 0.4725, + "step": 2730 + }, + { + "epoch": 0.06087806612157839, + "grad_norm": 1.6640625, + "learning_rate": 1.878266666666667e-05, + "loss": 0.4276, + "step": 2740 + }, + { + "epoch": 0.06110024884464984, + "grad_norm": 1.7890625, + "learning_rate": 1.8778222222222224e-05, + "loss": 0.4817, + "step": 2750 + }, + { + "epoch": 0.0613224315677213, + "grad_norm": 1.9375, + "learning_rate": 1.877377777777778e-05, + "loss": 0.4638, + "step": 2760 + }, + { + "epoch": 0.06154461429079275, + "grad_norm": 1.8046875, + "learning_rate": 1.8769333333333333e-05, + "loss": 0.4529, + "step": 2770 + }, + { + "epoch": 0.0617667970138642, + "grad_norm": 1.9609375, + "learning_rate": 1.876488888888889e-05, + "loss": 0.4779, + "step": 2780 + }, + { + "epoch": 0.06198897973693566, + "grad_norm": 1.90625, + "learning_rate": 1.8760444444444447e-05, + "loss": 0.4788, + "step": 2790 + }, + { + "epoch": 0.06221116246000711, + "grad_norm": 1.953125, + "learning_rate": 1.8756e-05, + "loss": 0.4513, + "step": 2800 + }, + { + "epoch": 0.06243334518307856, + "grad_norm": 1.6875, + "learning_rate": 1.8751555555555556e-05, + "loss": 0.4661, + "step": 2810 + }, + { + "epoch": 0.06265552790615002, + "grad_norm": 1.9296875, + "learning_rate": 1.8747111111111114e-05, + "loss": 0.4529, + "step": 2820 + }, + { + "epoch": 0.06287771062922147, + "grad_norm": 1.9765625, + "learning_rate": 1.874266666666667e-05, + "loss": 0.4556, + "step": 2830 + }, + { + "epoch": 0.06309989335229292, + "grad_norm": 2.171875, + "learning_rate": 1.8738222222222224e-05, + "loss": 0.4645, + "step": 2840 + }, + { + "epoch": 0.06332207607536439, + "grad_norm": 1.921875, + "learning_rate": 1.873377777777778e-05, + "loss": 0.3935, + "step": 2850 + }, + { + "epoch": 0.06354425879843584, + "grad_norm": 1.7890625, + "learning_rate": 1.8729333333333334e-05, + "loss": 0.4634, + "step": 2860 + }, + { + "epoch": 0.06376644152150729, + "grad_norm": 2.078125, + "learning_rate": 1.872488888888889e-05, + "loss": 0.4722, + "step": 2870 + }, + { + "epoch": 0.06398862424457874, + "grad_norm": 1.90625, + "learning_rate": 1.8720444444444447e-05, + "loss": 0.4674, + "step": 2880 + }, + { + "epoch": 0.06421080696765019, + "grad_norm": 2.015625, + "learning_rate": 1.8716000000000002e-05, + "loss": 0.4621, + "step": 2890 + }, + { + "epoch": 0.06443298969072164, + "grad_norm": 2.15625, + "learning_rate": 1.8711555555555557e-05, + "loss": 0.4584, + "step": 2900 + }, + { + "epoch": 0.06465517241379311, + "grad_norm": 1.859375, + "learning_rate": 1.8707111111111115e-05, + "loss": 0.4808, + "step": 2910 + }, + { + "epoch": 0.06487735513686456, + "grad_norm": 2.203125, + "learning_rate": 1.870266666666667e-05, + "loss": 0.4729, + "step": 2920 + }, + { + "epoch": 0.06509953785993601, + "grad_norm": 1.484375, + "learning_rate": 1.8698222222222225e-05, + "loss": 0.4282, + "step": 2930 + }, + { + "epoch": 0.06532172058300746, + "grad_norm": 2.078125, + "learning_rate": 1.869377777777778e-05, + "loss": 0.5072, + "step": 2940 + }, + { + "epoch": 0.06554390330607891, + "grad_norm": 1.8515625, + "learning_rate": 1.8689333333333334e-05, + "loss": 0.4563, + "step": 2950 + }, + { + "epoch": 0.06576608602915038, + "grad_norm": 1.7578125, + "learning_rate": 1.868488888888889e-05, + "loss": 0.4412, + "step": 2960 + }, + { + "epoch": 0.06598826875222183, + "grad_norm": 1.890625, + "learning_rate": 1.8680444444444444e-05, + "loss": 0.4591, + "step": 2970 + }, + { + "epoch": 0.06621045147529328, + "grad_norm": 2.03125, + "learning_rate": 1.8676000000000002e-05, + "loss": 0.4204, + "step": 2980 + }, + { + "epoch": 0.06643263419836473, + "grad_norm": 1.7578125, + "learning_rate": 1.8671555555555557e-05, + "loss": 0.4324, + "step": 2990 + }, + { + "epoch": 0.06665481692143618, + "grad_norm": 1.9921875, + "learning_rate": 1.8667111111111115e-05, + "loss": 0.434, + "step": 3000 + }, + { + "epoch": 0.06687699964450765, + "grad_norm": 2.359375, + "learning_rate": 1.866266666666667e-05, + "loss": 0.4756, + "step": 3010 + }, + { + "epoch": 0.0670991823675791, + "grad_norm": 2.1875, + "learning_rate": 1.8658222222222225e-05, + "loss": 0.4765, + "step": 3020 + }, + { + "epoch": 0.06732136509065055, + "grad_norm": 2.1875, + "learning_rate": 1.865377777777778e-05, + "loss": 0.4121, + "step": 3030 + }, + { + "epoch": 0.067543547813722, + "grad_norm": 1.6640625, + "learning_rate": 1.8649333333333335e-05, + "loss": 0.4742, + "step": 3040 + }, + { + "epoch": 0.06776573053679345, + "grad_norm": 1.8671875, + "learning_rate": 1.864488888888889e-05, + "loss": 0.4649, + "step": 3050 + }, + { + "epoch": 0.06798791325986492, + "grad_norm": 1.96875, + "learning_rate": 1.8640444444444444e-05, + "loss": 0.472, + "step": 3060 + }, + { + "epoch": 0.06821009598293637, + "grad_norm": 1.9765625, + "learning_rate": 1.8636e-05, + "loss": 0.4169, + "step": 3070 + }, + { + "epoch": 0.06843227870600782, + "grad_norm": 2.125, + "learning_rate": 1.8631555555555557e-05, + "loss": 0.4786, + "step": 3080 + }, + { + "epoch": 0.06865446142907927, + "grad_norm": 2.15625, + "learning_rate": 1.8627111111111112e-05, + "loss": 0.4465, + "step": 3090 + }, + { + "epoch": 0.06887664415215072, + "grad_norm": 1.984375, + "learning_rate": 1.862266666666667e-05, + "loss": 0.4672, + "step": 3100 + }, + { + "epoch": 0.06909882687522219, + "grad_norm": 2.375, + "learning_rate": 1.8618222222222225e-05, + "loss": 0.4604, + "step": 3110 + }, + { + "epoch": 0.06932100959829364, + "grad_norm": 1.890625, + "learning_rate": 1.861377777777778e-05, + "loss": 0.4389, + "step": 3120 + }, + { + "epoch": 0.06954319232136509, + "grad_norm": 2.171875, + "learning_rate": 1.8609333333333335e-05, + "loss": 0.4301, + "step": 3130 + }, + { + "epoch": 0.06976537504443654, + "grad_norm": 1.9296875, + "learning_rate": 1.860488888888889e-05, + "loss": 0.4703, + "step": 3140 + }, + { + "epoch": 0.069987557767508, + "grad_norm": 1.9921875, + "learning_rate": 1.8600444444444445e-05, + "loss": 0.4542, + "step": 3150 + }, + { + "epoch": 0.07020974049057946, + "grad_norm": 1.9921875, + "learning_rate": 1.8596e-05, + "loss": 0.4301, + "step": 3160 + }, + { + "epoch": 0.07043192321365091, + "grad_norm": 2.390625, + "learning_rate": 1.8591555555555554e-05, + "loss": 0.453, + "step": 3170 + }, + { + "epoch": 0.07065410593672236, + "grad_norm": 1.609375, + "learning_rate": 1.8587111111111113e-05, + "loss": 0.4156, + "step": 3180 + }, + { + "epoch": 0.07087628865979381, + "grad_norm": 2.0625, + "learning_rate": 1.8582666666666667e-05, + "loss": 0.4333, + "step": 3190 + }, + { + "epoch": 0.07109847138286526, + "grad_norm": 1.625, + "learning_rate": 1.8578222222222226e-05, + "loss": 0.4398, + "step": 3200 + }, + { + "epoch": 0.07132065410593673, + "grad_norm": 1.9609375, + "learning_rate": 1.857377777777778e-05, + "loss": 0.4683, + "step": 3210 + }, + { + "epoch": 0.07154283682900818, + "grad_norm": 1.6875, + "learning_rate": 1.8569333333333335e-05, + "loss": 0.4475, + "step": 3220 + }, + { + "epoch": 0.07176501955207963, + "grad_norm": 2.15625, + "learning_rate": 1.856488888888889e-05, + "loss": 0.4453, + "step": 3230 + }, + { + "epoch": 0.07198720227515108, + "grad_norm": 1.796875, + "learning_rate": 1.8560444444444445e-05, + "loss": 0.4922, + "step": 3240 + }, + { + "epoch": 0.07220938499822253, + "grad_norm": 1.9296875, + "learning_rate": 1.8556e-05, + "loss": 0.492, + "step": 3250 + }, + { + "epoch": 0.072431567721294, + "grad_norm": 1.7890625, + "learning_rate": 1.8551555555555555e-05, + "loss": 0.4376, + "step": 3260 + }, + { + "epoch": 0.07265375044436545, + "grad_norm": 2.4375, + "learning_rate": 1.8547111111111113e-05, + "loss": 0.4818, + "step": 3270 + }, + { + "epoch": 0.0728759331674369, + "grad_norm": 1.921875, + "learning_rate": 1.8542666666666668e-05, + "loss": 0.4779, + "step": 3280 + }, + { + "epoch": 0.07309811589050835, + "grad_norm": 1.546875, + "learning_rate": 1.8538222222222223e-05, + "loss": 0.4235, + "step": 3290 + }, + { + "epoch": 0.0733202986135798, + "grad_norm": 1.890625, + "learning_rate": 1.853377777777778e-05, + "loss": 0.4584, + "step": 3300 + }, + { + "epoch": 0.07354248133665126, + "grad_norm": 2.03125, + "learning_rate": 1.8529333333333336e-05, + "loss": 0.4687, + "step": 3310 + }, + { + "epoch": 0.07376466405972272, + "grad_norm": 1.9140625, + "learning_rate": 1.852488888888889e-05, + "loss": 0.4481, + "step": 3320 + }, + { + "epoch": 0.07398684678279417, + "grad_norm": 2.015625, + "learning_rate": 1.8520444444444445e-05, + "loss": 0.4901, + "step": 3330 + }, + { + "epoch": 0.07420902950586562, + "grad_norm": 1.4296875, + "learning_rate": 1.8516e-05, + "loss": 0.4399, + "step": 3340 + }, + { + "epoch": 0.07443121222893707, + "grad_norm": 2.203125, + "learning_rate": 1.851155555555556e-05, + "loss": 0.4367, + "step": 3350 + }, + { + "epoch": 0.07465339495200853, + "grad_norm": 2.28125, + "learning_rate": 1.8507111111111113e-05, + "loss": 0.4724, + "step": 3360 + }, + { + "epoch": 0.07487557767507999, + "grad_norm": 1.8515625, + "learning_rate": 1.8502666666666668e-05, + "loss": 0.4534, + "step": 3370 + }, + { + "epoch": 0.07509776039815144, + "grad_norm": 2.25, + "learning_rate": 1.8498222222222223e-05, + "loss": 0.4649, + "step": 3380 + }, + { + "epoch": 0.0753199431212229, + "grad_norm": 1.703125, + "learning_rate": 1.8493777777777778e-05, + "loss": 0.4539, + "step": 3390 + }, + { + "epoch": 0.07554212584429434, + "grad_norm": 1.75, + "learning_rate": 1.8489333333333336e-05, + "loss": 0.4484, + "step": 3400 + }, + { + "epoch": 0.0757643085673658, + "grad_norm": 1.9765625, + "learning_rate": 1.848488888888889e-05, + "loss": 0.4544, + "step": 3410 + }, + { + "epoch": 0.07598649129043726, + "grad_norm": 1.8671875, + "learning_rate": 1.8480444444444446e-05, + "loss": 0.4431, + "step": 3420 + }, + { + "epoch": 0.07620867401350871, + "grad_norm": 2.1875, + "learning_rate": 1.8476e-05, + "loss": 0.437, + "step": 3430 + }, + { + "epoch": 0.07643085673658016, + "grad_norm": 2.0625, + "learning_rate": 1.847155555555556e-05, + "loss": 0.458, + "step": 3440 + }, + { + "epoch": 0.07665303945965161, + "grad_norm": 2.09375, + "learning_rate": 1.8467111111111114e-05, + "loss": 0.4512, + "step": 3450 + }, + { + "epoch": 0.07687522218272307, + "grad_norm": 2.375, + "learning_rate": 1.846266666666667e-05, + "loss": 0.435, + "step": 3460 + }, + { + "epoch": 0.07709740490579453, + "grad_norm": 2.203125, + "learning_rate": 1.8458222222222223e-05, + "loss": 0.4778, + "step": 3470 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 1.9375, + "learning_rate": 1.8453777777777778e-05, + "loss": 0.446, + "step": 3480 + }, + { + "epoch": 0.07754177035193743, + "grad_norm": 1.859375, + "learning_rate": 1.8449333333333333e-05, + "loss": 0.4624, + "step": 3490 + }, + { + "epoch": 0.07776395307500888, + "grad_norm": 1.921875, + "learning_rate": 1.844488888888889e-05, + "loss": 0.4458, + "step": 3500 + }, + { + "epoch": 0.07798613579808034, + "grad_norm": 1.7734375, + "learning_rate": 1.8440444444444446e-05, + "loss": 0.4879, + "step": 3510 + }, + { + "epoch": 0.0782083185211518, + "grad_norm": 1.8046875, + "learning_rate": 1.8436e-05, + "loss": 0.4646, + "step": 3520 + }, + { + "epoch": 0.07843050124422325, + "grad_norm": 1.6015625, + "learning_rate": 1.843155555555556e-05, + "loss": 0.4816, + "step": 3530 + }, + { + "epoch": 0.0786526839672947, + "grad_norm": 1.8359375, + "learning_rate": 1.8427111111111114e-05, + "loss": 0.4507, + "step": 3540 + }, + { + "epoch": 0.07887486669036615, + "grad_norm": 1.9765625, + "learning_rate": 1.842266666666667e-05, + "loss": 0.425, + "step": 3550 + }, + { + "epoch": 0.0790970494134376, + "grad_norm": 2.203125, + "learning_rate": 1.8418222222222224e-05, + "loss": 0.4619, + "step": 3560 + }, + { + "epoch": 0.07931923213650907, + "grad_norm": 1.8828125, + "learning_rate": 1.841377777777778e-05, + "loss": 0.4677, + "step": 3570 + }, + { + "epoch": 0.07954141485958052, + "grad_norm": 2.15625, + "learning_rate": 1.8409333333333333e-05, + "loss": 0.4423, + "step": 3580 + }, + { + "epoch": 0.07976359758265197, + "grad_norm": 1.59375, + "learning_rate": 1.840488888888889e-05, + "loss": 0.4354, + "step": 3590 + }, + { + "epoch": 0.07998578030572343, + "grad_norm": 1.8984375, + "learning_rate": 1.8400444444444447e-05, + "loss": 0.4159, + "step": 3600 + }, + { + "epoch": 0.08020796302879488, + "grad_norm": 2.15625, + "learning_rate": 1.8396e-05, + "loss": 0.4579, + "step": 3610 + }, + { + "epoch": 0.08043014575186634, + "grad_norm": 1.9765625, + "learning_rate": 1.839155555555556e-05, + "loss": 0.4623, + "step": 3620 + }, + { + "epoch": 0.08065232847493779, + "grad_norm": 2.03125, + "learning_rate": 1.8387111111111114e-05, + "loss": 0.4846, + "step": 3630 + }, + { + "epoch": 0.08087451119800924, + "grad_norm": 1.5703125, + "learning_rate": 1.838266666666667e-05, + "loss": 0.4815, + "step": 3640 + }, + { + "epoch": 0.0810966939210807, + "grad_norm": 1.8828125, + "learning_rate": 1.8378222222222224e-05, + "loss": 0.4374, + "step": 3650 + }, + { + "epoch": 0.08131887664415215, + "grad_norm": 1.5703125, + "learning_rate": 1.837377777777778e-05, + "loss": 0.454, + "step": 3660 + }, + { + "epoch": 0.0815410593672236, + "grad_norm": 1.75, + "learning_rate": 1.8369333333333334e-05, + "loss": 0.4526, + "step": 3670 + }, + { + "epoch": 0.08176324209029506, + "grad_norm": 1.71875, + "learning_rate": 1.836488888888889e-05, + "loss": 0.4432, + "step": 3680 + }, + { + "epoch": 0.08198542481336651, + "grad_norm": 1.7109375, + "learning_rate": 1.8360444444444444e-05, + "loss": 0.4295, + "step": 3690 + }, + { + "epoch": 0.08220760753643797, + "grad_norm": 1.9140625, + "learning_rate": 1.8356000000000002e-05, + "loss": 0.4415, + "step": 3700 + }, + { + "epoch": 0.08242979025950942, + "grad_norm": 1.765625, + "learning_rate": 1.8351555555555557e-05, + "loss": 0.43, + "step": 3710 + }, + { + "epoch": 0.08265197298258087, + "grad_norm": 1.8515625, + "learning_rate": 1.8347111111111115e-05, + "loss": 0.4624, + "step": 3720 + }, + { + "epoch": 0.08287415570565233, + "grad_norm": 1.828125, + "learning_rate": 1.834266666666667e-05, + "loss": 0.4405, + "step": 3730 + }, + { + "epoch": 0.08309633842872378, + "grad_norm": 2.234375, + "learning_rate": 1.8338222222222225e-05, + "loss": 0.4621, + "step": 3740 + }, + { + "epoch": 0.08331852115179524, + "grad_norm": 2.03125, + "learning_rate": 1.833377777777778e-05, + "loss": 0.4183, + "step": 3750 + }, + { + "epoch": 0.08354070387486669, + "grad_norm": 1.921875, + "learning_rate": 1.8329333333333334e-05, + "loss": 0.49, + "step": 3760 + }, + { + "epoch": 0.08376288659793814, + "grad_norm": 2.125, + "learning_rate": 1.832488888888889e-05, + "loss": 0.4697, + "step": 3770 + }, + { + "epoch": 0.0839850693210096, + "grad_norm": 1.671875, + "learning_rate": 1.8320444444444444e-05, + "loss": 0.4161, + "step": 3780 + }, + { + "epoch": 0.08420725204408105, + "grad_norm": 1.9765625, + "learning_rate": 1.8316e-05, + "loss": 0.4223, + "step": 3790 + }, + { + "epoch": 0.0844294347671525, + "grad_norm": 1.9453125, + "learning_rate": 1.8311555555555557e-05, + "loss": 0.4626, + "step": 3800 + }, + { + "epoch": 0.08465161749022396, + "grad_norm": 1.96875, + "learning_rate": 1.8307111111111112e-05, + "loss": 0.4782, + "step": 3810 + }, + { + "epoch": 0.08487380021329541, + "grad_norm": 1.8984375, + "learning_rate": 1.830266666666667e-05, + "loss": 0.4219, + "step": 3820 + }, + { + "epoch": 0.08509598293636687, + "grad_norm": 2.09375, + "learning_rate": 1.8298222222222225e-05, + "loss": 0.4611, + "step": 3830 + }, + { + "epoch": 0.08531816565943832, + "grad_norm": 2.09375, + "learning_rate": 1.829377777777778e-05, + "loss": 0.4372, + "step": 3840 + }, + { + "epoch": 0.08554034838250978, + "grad_norm": 1.859375, + "learning_rate": 1.8289333333333335e-05, + "loss": 0.4556, + "step": 3850 + }, + { + "epoch": 0.08576253110558123, + "grad_norm": 1.9921875, + "learning_rate": 1.828488888888889e-05, + "loss": 0.4458, + "step": 3860 + }, + { + "epoch": 0.08598471382865268, + "grad_norm": 2.03125, + "learning_rate": 1.8280444444444444e-05, + "loss": 0.4566, + "step": 3870 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 1.578125, + "learning_rate": 1.8276e-05, + "loss": 0.4049, + "step": 3880 + }, + { + "epoch": 0.0864290792747956, + "grad_norm": 1.9296875, + "learning_rate": 1.8271555555555557e-05, + "loss": 0.4401, + "step": 3890 + }, + { + "epoch": 0.08665126199786705, + "grad_norm": 1.9140625, + "learning_rate": 1.8267111111111112e-05, + "loss": 0.4426, + "step": 3900 + }, + { + "epoch": 0.0868734447209385, + "grad_norm": 1.6875, + "learning_rate": 1.8262666666666667e-05, + "loss": 0.4529, + "step": 3910 + }, + { + "epoch": 0.08709562744400995, + "grad_norm": 1.7578125, + "learning_rate": 1.8258222222222225e-05, + "loss": 0.4444, + "step": 3920 + }, + { + "epoch": 0.08731781016708141, + "grad_norm": 2.0625, + "learning_rate": 1.825377777777778e-05, + "loss": 0.4486, + "step": 3930 + }, + { + "epoch": 0.08753999289015287, + "grad_norm": 2.046875, + "learning_rate": 1.8249333333333335e-05, + "loss": 0.4299, + "step": 3940 + }, + { + "epoch": 0.08776217561322432, + "grad_norm": 1.5859375, + "learning_rate": 1.824488888888889e-05, + "loss": 0.4385, + "step": 3950 + }, + { + "epoch": 0.08798435833629577, + "grad_norm": 1.9140625, + "learning_rate": 1.8240444444444445e-05, + "loss": 0.4493, + "step": 3960 + }, + { + "epoch": 0.08820654105936722, + "grad_norm": 1.71875, + "learning_rate": 1.8236000000000003e-05, + "loss": 0.455, + "step": 3970 + }, + { + "epoch": 0.08842872378243868, + "grad_norm": 2.1875, + "learning_rate": 1.8231555555555558e-05, + "loss": 0.4461, + "step": 3980 + }, + { + "epoch": 0.08865090650551014, + "grad_norm": 1.6640625, + "learning_rate": 1.8227111111111113e-05, + "loss": 0.4066, + "step": 3990 + }, + { + "epoch": 0.08887308922858159, + "grad_norm": 1.5703125, + "learning_rate": 1.8222666666666667e-05, + "loss": 0.4344, + "step": 4000 + }, + { + "epoch": 0.08909527195165304, + "grad_norm": 1.8515625, + "learning_rate": 1.8218222222222222e-05, + "loss": 0.4603, + "step": 4010 + }, + { + "epoch": 0.08931745467472449, + "grad_norm": 1.8125, + "learning_rate": 1.821377777777778e-05, + "loss": 0.4158, + "step": 4020 + }, + { + "epoch": 0.08953963739779595, + "grad_norm": 1.7421875, + "learning_rate": 1.8209333333333335e-05, + "loss": 0.4181, + "step": 4030 + }, + { + "epoch": 0.0897618201208674, + "grad_norm": 2.03125, + "learning_rate": 1.820488888888889e-05, + "loss": 0.4191, + "step": 4040 + }, + { + "epoch": 0.08998400284393886, + "grad_norm": 2.015625, + "learning_rate": 1.8200444444444445e-05, + "loss": 0.4887, + "step": 4050 + }, + { + "epoch": 0.09020618556701031, + "grad_norm": 1.859375, + "learning_rate": 1.8196000000000003e-05, + "loss": 0.4436, + "step": 4060 + }, + { + "epoch": 0.09042836829008176, + "grad_norm": 1.9453125, + "learning_rate": 1.8191555555555558e-05, + "loss": 0.4669, + "step": 4070 + }, + { + "epoch": 0.09065055101315321, + "grad_norm": 1.859375, + "learning_rate": 1.8187111111111113e-05, + "loss": 0.4529, + "step": 4080 + }, + { + "epoch": 0.09087273373622468, + "grad_norm": 2.015625, + "learning_rate": 1.8182666666666668e-05, + "loss": 0.4579, + "step": 4090 + }, + { + "epoch": 0.09109491645929613, + "grad_norm": 1.828125, + "learning_rate": 1.8178222222222223e-05, + "loss": 0.4856, + "step": 4100 + }, + { + "epoch": 0.09131709918236758, + "grad_norm": 2.140625, + "learning_rate": 1.8173777777777778e-05, + "loss": 0.4385, + "step": 4110 + }, + { + "epoch": 0.09153928190543903, + "grad_norm": 1.7890625, + "learning_rate": 1.8169333333333336e-05, + "loss": 0.4163, + "step": 4120 + }, + { + "epoch": 0.09176146462851048, + "grad_norm": 1.8125, + "learning_rate": 1.816488888888889e-05, + "loss": 0.4889, + "step": 4130 + }, + { + "epoch": 0.09198364735158195, + "grad_norm": 2.1875, + "learning_rate": 1.8160444444444445e-05, + "loss": 0.4412, + "step": 4140 + }, + { + "epoch": 0.0922058300746534, + "grad_norm": 2.109375, + "learning_rate": 1.8156000000000004e-05, + "loss": 0.5022, + "step": 4150 + }, + { + "epoch": 0.09242801279772485, + "grad_norm": 1.9453125, + "learning_rate": 1.815155555555556e-05, + "loss": 0.4063, + "step": 4160 + }, + { + "epoch": 0.0926501955207963, + "grad_norm": 2.09375, + "learning_rate": 1.8147111111111113e-05, + "loss": 0.4474, + "step": 4170 + }, + { + "epoch": 0.09287237824386775, + "grad_norm": 1.3515625, + "learning_rate": 1.8142666666666668e-05, + "loss": 0.4276, + "step": 4180 + }, + { + "epoch": 0.09309456096693922, + "grad_norm": 1.875, + "learning_rate": 1.8138222222222223e-05, + "loss": 0.4464, + "step": 4190 + }, + { + "epoch": 0.09331674369001067, + "grad_norm": 2.328125, + "learning_rate": 1.8133777777777778e-05, + "loss": 0.4756, + "step": 4200 + }, + { + "epoch": 0.09353892641308212, + "grad_norm": 2.0625, + "learning_rate": 1.8129333333333333e-05, + "loss": 0.4513, + "step": 4210 + }, + { + "epoch": 0.09376110913615357, + "grad_norm": 2.125, + "learning_rate": 1.812488888888889e-05, + "loss": 0.4588, + "step": 4220 + }, + { + "epoch": 0.09398329185922502, + "grad_norm": 2.171875, + "learning_rate": 1.8120444444444446e-05, + "loss": 0.424, + "step": 4230 + }, + { + "epoch": 0.09420547458229649, + "grad_norm": 2.078125, + "learning_rate": 1.8116000000000004e-05, + "loss": 0.4253, + "step": 4240 + }, + { + "epoch": 0.09442765730536794, + "grad_norm": 1.703125, + "learning_rate": 1.811155555555556e-05, + "loss": 0.4561, + "step": 4250 + }, + { + "epoch": 0.09464984002843939, + "grad_norm": 1.8203125, + "learning_rate": 1.8107111111111114e-05, + "loss": 0.3994, + "step": 4260 + }, + { + "epoch": 0.09487202275151084, + "grad_norm": 1.8046875, + "learning_rate": 1.810266666666667e-05, + "loss": 0.4645, + "step": 4270 + }, + { + "epoch": 0.09509420547458229, + "grad_norm": 2.109375, + "learning_rate": 1.8098222222222223e-05, + "loss": 0.4633, + "step": 4280 + }, + { + "epoch": 0.09531638819765376, + "grad_norm": 1.921875, + "learning_rate": 1.809377777777778e-05, + "loss": 0.4848, + "step": 4290 + }, + { + "epoch": 0.09553857092072521, + "grad_norm": 2.15625, + "learning_rate": 1.8089333333333333e-05, + "loss": 0.4327, + "step": 4300 + }, + { + "epoch": 0.09576075364379666, + "grad_norm": 1.953125, + "learning_rate": 1.8084888888888888e-05, + "loss": 0.4712, + "step": 4310 + }, + { + "epoch": 0.09598293636686811, + "grad_norm": 1.9375, + "learning_rate": 1.8080444444444446e-05, + "loss": 0.4267, + "step": 4320 + }, + { + "epoch": 0.09620511908993956, + "grad_norm": 1.4453125, + "learning_rate": 1.8076e-05, + "loss": 0.4325, + "step": 4330 + }, + { + "epoch": 0.09642730181301103, + "grad_norm": 1.9140625, + "learning_rate": 1.807155555555556e-05, + "loss": 0.4381, + "step": 4340 + }, + { + "epoch": 0.09664948453608248, + "grad_norm": 1.9140625, + "learning_rate": 1.8067111111111114e-05, + "loss": 0.4514, + "step": 4350 + }, + { + "epoch": 0.09687166725915393, + "grad_norm": 1.9921875, + "learning_rate": 1.806266666666667e-05, + "loss": 0.4969, + "step": 4360 + }, + { + "epoch": 0.09709384998222538, + "grad_norm": 1.84375, + "learning_rate": 1.8058222222222224e-05, + "loss": 0.4552, + "step": 4370 + }, + { + "epoch": 0.09731603270529683, + "grad_norm": 2.0625, + "learning_rate": 1.805377777777778e-05, + "loss": 0.432, + "step": 4380 + }, + { + "epoch": 0.0975382154283683, + "grad_norm": 1.984375, + "learning_rate": 1.8049333333333334e-05, + "loss": 0.4501, + "step": 4390 + }, + { + "epoch": 0.09776039815143975, + "grad_norm": 1.8515625, + "learning_rate": 1.804488888888889e-05, + "loss": 0.4332, + "step": 4400 + }, + { + "epoch": 0.0979825808745112, + "grad_norm": 1.703125, + "learning_rate": 1.8040444444444443e-05, + "loss": 0.4507, + "step": 4410 + }, + { + "epoch": 0.09820476359758265, + "grad_norm": 2.015625, + "learning_rate": 1.8036e-05, + "loss": 0.4568, + "step": 4420 + }, + { + "epoch": 0.0984269463206541, + "grad_norm": 1.796875, + "learning_rate": 1.8031555555555556e-05, + "loss": 0.4763, + "step": 4430 + }, + { + "epoch": 0.09864912904372555, + "grad_norm": 1.9453125, + "learning_rate": 1.8027111111111115e-05, + "loss": 0.4373, + "step": 4440 + }, + { + "epoch": 0.09887131176679702, + "grad_norm": 2.109375, + "learning_rate": 1.802266666666667e-05, + "loss": 0.4492, + "step": 4450 + }, + { + "epoch": 0.09909349448986847, + "grad_norm": 2.171875, + "learning_rate": 1.8018222222222224e-05, + "loss": 0.454, + "step": 4460 + }, + { + "epoch": 0.09931567721293992, + "grad_norm": 1.8203125, + "learning_rate": 1.801377777777778e-05, + "loss": 0.4747, + "step": 4470 + }, + { + "epoch": 0.09953785993601137, + "grad_norm": 1.8359375, + "learning_rate": 1.8009333333333334e-05, + "loss": 0.4355, + "step": 4480 + }, + { + "epoch": 0.09976004265908282, + "grad_norm": 2.15625, + "learning_rate": 1.800488888888889e-05, + "loss": 0.4867, + "step": 4490 + }, + { + "epoch": 0.09998222538215429, + "grad_norm": 2.234375, + "learning_rate": 1.8000444444444444e-05, + "loss": 0.4415, + "step": 4500 + }, + { + "epoch": 0.10020440810522574, + "grad_norm": 1.9375, + "learning_rate": 1.7996000000000002e-05, + "loss": 0.4903, + "step": 4510 + }, + { + "epoch": 0.10042659082829719, + "grad_norm": 2.15625, + "learning_rate": 1.7991555555555557e-05, + "loss": 0.4308, + "step": 4520 + }, + { + "epoch": 0.10064877355136864, + "grad_norm": 1.65625, + "learning_rate": 1.798711111111111e-05, + "loss": 0.5011, + "step": 4530 + }, + { + "epoch": 0.1008709562744401, + "grad_norm": 2.03125, + "learning_rate": 1.798266666666667e-05, + "loss": 0.4614, + "step": 4540 + }, + { + "epoch": 0.10109313899751156, + "grad_norm": 1.78125, + "learning_rate": 1.7978222222222225e-05, + "loss": 0.4547, + "step": 4550 + }, + { + "epoch": 0.10131532172058301, + "grad_norm": 1.8984375, + "learning_rate": 1.797377777777778e-05, + "loss": 0.4505, + "step": 4560 + }, + { + "epoch": 0.10153750444365446, + "grad_norm": 1.5625, + "learning_rate": 1.7969333333333334e-05, + "loss": 0.4285, + "step": 4570 + }, + { + "epoch": 0.10175968716672591, + "grad_norm": 1.7421875, + "learning_rate": 1.796488888888889e-05, + "loss": 0.4422, + "step": 4580 + }, + { + "epoch": 0.10198186988979736, + "grad_norm": 1.7578125, + "learning_rate": 1.7960444444444447e-05, + "loss": 0.4242, + "step": 4590 + }, + { + "epoch": 0.10220405261286883, + "grad_norm": 1.734375, + "learning_rate": 1.7956000000000002e-05, + "loss": 0.4183, + "step": 4600 + }, + { + "epoch": 0.10242623533594028, + "grad_norm": 1.8828125, + "learning_rate": 1.7951555555555557e-05, + "loss": 0.4375, + "step": 4610 + }, + { + "epoch": 0.10264841805901173, + "grad_norm": 1.9296875, + "learning_rate": 1.7947111111111112e-05, + "loss": 0.4561, + "step": 4620 + }, + { + "epoch": 0.10287060078208318, + "grad_norm": 1.5234375, + "learning_rate": 1.7942666666666667e-05, + "loss": 0.4326, + "step": 4630 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 2.0625, + "learning_rate": 1.7938222222222225e-05, + "loss": 0.4414, + "step": 4640 + }, + { + "epoch": 0.1033149662282261, + "grad_norm": 1.9609375, + "learning_rate": 1.793377777777778e-05, + "loss": 0.4279, + "step": 4650 + }, + { + "epoch": 0.10353714895129755, + "grad_norm": 1.859375, + "learning_rate": 1.7929333333333335e-05, + "loss": 0.4223, + "step": 4660 + }, + { + "epoch": 0.103759331674369, + "grad_norm": 2.078125, + "learning_rate": 1.792488888888889e-05, + "loss": 0.4071, + "step": 4670 + }, + { + "epoch": 0.10398151439744045, + "grad_norm": 1.8359375, + "learning_rate": 1.7920444444444448e-05, + "loss": 0.4339, + "step": 4680 + }, + { + "epoch": 0.1042036971205119, + "grad_norm": 1.8984375, + "learning_rate": 1.7916000000000003e-05, + "loss": 0.4988, + "step": 4690 + }, + { + "epoch": 0.10442587984358337, + "grad_norm": 1.7421875, + "learning_rate": 1.7911555555555557e-05, + "loss": 0.4143, + "step": 4700 + }, + { + "epoch": 0.10464806256665482, + "grad_norm": 1.4375, + "learning_rate": 1.7907111111111112e-05, + "loss": 0.4351, + "step": 4710 + }, + { + "epoch": 0.10487024528972627, + "grad_norm": 1.9609375, + "learning_rate": 1.7902666666666667e-05, + "loss": 0.4467, + "step": 4720 + }, + { + "epoch": 0.10509242801279772, + "grad_norm": 2.015625, + "learning_rate": 1.7898222222222222e-05, + "loss": 0.446, + "step": 4730 + }, + { + "epoch": 0.10531461073586917, + "grad_norm": 1.8125, + "learning_rate": 1.789377777777778e-05, + "loss": 0.4593, + "step": 4740 + }, + { + "epoch": 0.10553679345894064, + "grad_norm": 1.859375, + "learning_rate": 1.7889333333333335e-05, + "loss": 0.4329, + "step": 4750 + }, + { + "epoch": 0.10575897618201209, + "grad_norm": 1.6328125, + "learning_rate": 1.788488888888889e-05, + "loss": 0.389, + "step": 4760 + }, + { + "epoch": 0.10598115890508354, + "grad_norm": 1.71875, + "learning_rate": 1.7880444444444448e-05, + "loss": 0.4639, + "step": 4770 + }, + { + "epoch": 0.10620334162815499, + "grad_norm": 1.9375, + "learning_rate": 1.7876000000000003e-05, + "loss": 0.4327, + "step": 4780 + }, + { + "epoch": 0.10642552435122644, + "grad_norm": 1.9375, + "learning_rate": 1.7871555555555558e-05, + "loss": 0.4199, + "step": 4790 + }, + { + "epoch": 0.10664770707429791, + "grad_norm": 1.9765625, + "learning_rate": 1.7867111111111113e-05, + "loss": 0.4632, + "step": 4800 + }, + { + "epoch": 0.10686988979736936, + "grad_norm": 2.09375, + "learning_rate": 1.7862666666666668e-05, + "loss": 0.449, + "step": 4810 + }, + { + "epoch": 0.10709207252044081, + "grad_norm": 2.046875, + "learning_rate": 1.7858222222222222e-05, + "loss": 0.4737, + "step": 4820 + }, + { + "epoch": 0.10731425524351226, + "grad_norm": 1.5859375, + "learning_rate": 1.7853777777777777e-05, + "loss": 0.4336, + "step": 4830 + }, + { + "epoch": 0.10753643796658371, + "grad_norm": 1.9765625, + "learning_rate": 1.7849333333333335e-05, + "loss": 0.4554, + "step": 4840 + }, + { + "epoch": 0.10775862068965517, + "grad_norm": 1.5703125, + "learning_rate": 1.784488888888889e-05, + "loss": 0.3901, + "step": 4850 + }, + { + "epoch": 0.10798080341272663, + "grad_norm": 1.6953125, + "learning_rate": 1.784044444444445e-05, + "loss": 0.4239, + "step": 4860 + }, + { + "epoch": 0.10820298613579808, + "grad_norm": 1.8203125, + "learning_rate": 1.7836000000000003e-05, + "loss": 0.4552, + "step": 4870 + }, + { + "epoch": 0.10842516885886953, + "grad_norm": 1.828125, + "learning_rate": 1.7831555555555558e-05, + "loss": 0.4197, + "step": 4880 + }, + { + "epoch": 0.10864735158194098, + "grad_norm": 1.7421875, + "learning_rate": 1.7827111111111113e-05, + "loss": 0.404, + "step": 4890 + }, + { + "epoch": 0.10886953430501244, + "grad_norm": 2.015625, + "learning_rate": 1.7822666666666668e-05, + "loss": 0.4529, + "step": 4900 + }, + { + "epoch": 0.1090917170280839, + "grad_norm": 1.671875, + "learning_rate": 1.7818222222222223e-05, + "loss": 0.4506, + "step": 4910 + }, + { + "epoch": 0.10931389975115535, + "grad_norm": 1.953125, + "learning_rate": 1.7813777777777778e-05, + "loss": 0.4431, + "step": 4920 + }, + { + "epoch": 0.1095360824742268, + "grad_norm": 2.28125, + "learning_rate": 1.7809333333333332e-05, + "loss": 0.4971, + "step": 4930 + }, + { + "epoch": 0.10975826519729825, + "grad_norm": 1.5703125, + "learning_rate": 1.780488888888889e-05, + "loss": 0.4553, + "step": 4940 + }, + { + "epoch": 0.1099804479203697, + "grad_norm": 1.703125, + "learning_rate": 1.7800444444444446e-05, + "loss": 0.4072, + "step": 4950 + }, + { + "epoch": 0.11020263064344117, + "grad_norm": 1.90625, + "learning_rate": 1.7796000000000004e-05, + "loss": 0.4075, + "step": 4960 + }, + { + "epoch": 0.11042481336651262, + "grad_norm": 2.0625, + "learning_rate": 1.779155555555556e-05, + "loss": 0.4849, + "step": 4970 + }, + { + "epoch": 0.11064699608958407, + "grad_norm": 1.8203125, + "learning_rate": 1.7787111111111113e-05, + "loss": 0.4143, + "step": 4980 + }, + { + "epoch": 0.11086917881265552, + "grad_norm": 2.0625, + "learning_rate": 1.7782666666666668e-05, + "loss": 0.4471, + "step": 4990 + }, + { + "epoch": 0.11109136153572698, + "grad_norm": 1.890625, + "learning_rate": 1.7778222222222223e-05, + "loss": 0.4606, + "step": 5000 + }, + { + "epoch": 0.11131354425879844, + "grad_norm": 1.7734375, + "learning_rate": 1.7773777777777778e-05, + "loss": 0.4429, + "step": 5010 + }, + { + "epoch": 0.11153572698186989, + "grad_norm": 1.7890625, + "learning_rate": 1.7769333333333333e-05, + "loss": 0.4411, + "step": 5020 + }, + { + "epoch": 0.11175790970494134, + "grad_norm": 1.8203125, + "learning_rate": 1.7764888888888888e-05, + "loss": 0.4839, + "step": 5030 + }, + { + "epoch": 0.1119800924280128, + "grad_norm": 2.09375, + "learning_rate": 1.7760444444444446e-05, + "loss": 0.4667, + "step": 5040 + }, + { + "epoch": 0.11220227515108425, + "grad_norm": 1.7734375, + "learning_rate": 1.7756e-05, + "loss": 0.4371, + "step": 5050 + }, + { + "epoch": 0.11242445787415571, + "grad_norm": 1.8125, + "learning_rate": 1.775155555555556e-05, + "loss": 0.435, + "step": 5060 + }, + { + "epoch": 0.11264664059722716, + "grad_norm": 1.7265625, + "learning_rate": 1.7747111111111114e-05, + "loss": 0.426, + "step": 5070 + }, + { + "epoch": 0.11286882332029861, + "grad_norm": 2.03125, + "learning_rate": 1.774266666666667e-05, + "loss": 0.4797, + "step": 5080 + }, + { + "epoch": 0.11309100604337007, + "grad_norm": 1.734375, + "learning_rate": 1.7738222222222224e-05, + "loss": 0.4882, + "step": 5090 + }, + { + "epoch": 0.11331318876644152, + "grad_norm": 1.7265625, + "learning_rate": 1.773377777777778e-05, + "loss": 0.4936, + "step": 5100 + }, + { + "epoch": 0.11353537148951298, + "grad_norm": 1.765625, + "learning_rate": 1.7729333333333333e-05, + "loss": 0.4728, + "step": 5110 + }, + { + "epoch": 0.11375755421258443, + "grad_norm": 1.6640625, + "learning_rate": 1.7724888888888888e-05, + "loss": 0.4162, + "step": 5120 + }, + { + "epoch": 0.11397973693565588, + "grad_norm": 1.5234375, + "learning_rate": 1.7720444444444446e-05, + "loss": 0.4128, + "step": 5130 + }, + { + "epoch": 0.11420191965872734, + "grad_norm": 1.8359375, + "learning_rate": 1.7716e-05, + "loss": 0.4388, + "step": 5140 + }, + { + "epoch": 0.11442410238179879, + "grad_norm": 2.015625, + "learning_rate": 1.7711555555555556e-05, + "loss": 0.4442, + "step": 5150 + }, + { + "epoch": 0.11464628510487025, + "grad_norm": 2.140625, + "learning_rate": 1.7707111111111114e-05, + "loss": 0.4308, + "step": 5160 + }, + { + "epoch": 0.1148684678279417, + "grad_norm": 1.625, + "learning_rate": 1.770266666666667e-05, + "loss": 0.4208, + "step": 5170 + }, + { + "epoch": 0.11509065055101315, + "grad_norm": 1.84375, + "learning_rate": 1.7698222222222224e-05, + "loss": 0.4284, + "step": 5180 + }, + { + "epoch": 0.1153128332740846, + "grad_norm": 1.859375, + "learning_rate": 1.769377777777778e-05, + "loss": 0.4512, + "step": 5190 + }, + { + "epoch": 0.11553501599715606, + "grad_norm": 1.96875, + "learning_rate": 1.7689333333333334e-05, + "loss": 0.464, + "step": 5200 + }, + { + "epoch": 0.11575719872022751, + "grad_norm": 2.109375, + "learning_rate": 1.7684888888888892e-05, + "loss": 0.4381, + "step": 5210 + }, + { + "epoch": 0.11597938144329897, + "grad_norm": 1.7734375, + "learning_rate": 1.7680444444444447e-05, + "loss": 0.4139, + "step": 5220 + }, + { + "epoch": 0.11620156416637042, + "grad_norm": 1.5234375, + "learning_rate": 1.7676e-05, + "loss": 0.4377, + "step": 5230 + }, + { + "epoch": 0.11642374688944188, + "grad_norm": 2.0, + "learning_rate": 1.7671555555555556e-05, + "loss": 0.4682, + "step": 5240 + }, + { + "epoch": 0.11664592961251333, + "grad_norm": 1.890625, + "learning_rate": 1.766711111111111e-05, + "loss": 0.4179, + "step": 5250 + }, + { + "epoch": 0.11686811233558478, + "grad_norm": 1.859375, + "learning_rate": 1.766266666666667e-05, + "loss": 0.438, + "step": 5260 + }, + { + "epoch": 0.11709029505865624, + "grad_norm": 2.03125, + "learning_rate": 1.7658222222222224e-05, + "loss": 0.4374, + "step": 5270 + }, + { + "epoch": 0.1173124777817277, + "grad_norm": 1.9765625, + "learning_rate": 1.765377777777778e-05, + "loss": 0.4344, + "step": 5280 + }, + { + "epoch": 0.11753466050479915, + "grad_norm": 1.9921875, + "learning_rate": 1.7649333333333334e-05, + "loss": 0.4616, + "step": 5290 + }, + { + "epoch": 0.1177568432278706, + "grad_norm": 1.71875, + "learning_rate": 1.7644888888888892e-05, + "loss": 0.3837, + "step": 5300 + }, + { + "epoch": 0.11797902595094205, + "grad_norm": 1.9453125, + "learning_rate": 1.7640444444444447e-05, + "loss": 0.4316, + "step": 5310 + }, + { + "epoch": 0.11820120867401351, + "grad_norm": 1.9921875, + "learning_rate": 1.7636000000000002e-05, + "loss": 0.4124, + "step": 5320 + }, + { + "epoch": 0.11842339139708496, + "grad_norm": 2.015625, + "learning_rate": 1.7631555555555557e-05, + "loss": 0.4376, + "step": 5330 + }, + { + "epoch": 0.11864557412015642, + "grad_norm": 2.015625, + "learning_rate": 1.762711111111111e-05, + "loss": 0.437, + "step": 5340 + }, + { + "epoch": 0.11886775684322787, + "grad_norm": 1.765625, + "learning_rate": 1.7622666666666666e-05, + "loss": 0.4459, + "step": 5350 + }, + { + "epoch": 0.11908993956629932, + "grad_norm": 2.3125, + "learning_rate": 1.7618222222222225e-05, + "loss": 0.4244, + "step": 5360 + }, + { + "epoch": 0.11931212228937078, + "grad_norm": 2.5, + "learning_rate": 1.761377777777778e-05, + "loss": 0.4318, + "step": 5370 + }, + { + "epoch": 0.11953430501244224, + "grad_norm": 2.0, + "learning_rate": 1.7609333333333334e-05, + "loss": 0.4228, + "step": 5380 + }, + { + "epoch": 0.11975648773551369, + "grad_norm": 1.9921875, + "learning_rate": 1.7604888888888893e-05, + "loss": 0.4659, + "step": 5390 + }, + { + "epoch": 0.11997867045858514, + "grad_norm": 2.296875, + "learning_rate": 1.7600444444444447e-05, + "loss": 0.4646, + "step": 5400 + }, + { + "epoch": 0.12020085318165659, + "grad_norm": 1.7109375, + "learning_rate": 1.7596000000000002e-05, + "loss": 0.405, + "step": 5410 + }, + { + "epoch": 0.12042303590472805, + "grad_norm": 2.5625, + "learning_rate": 1.7591555555555557e-05, + "loss": 0.4746, + "step": 5420 + }, + { + "epoch": 0.1206452186277995, + "grad_norm": 2.0, + "learning_rate": 1.7587111111111112e-05, + "loss": 0.4529, + "step": 5430 + }, + { + "epoch": 0.12086740135087096, + "grad_norm": 1.6953125, + "learning_rate": 1.7582666666666667e-05, + "loss": 0.4221, + "step": 5440 + }, + { + "epoch": 0.12108958407394241, + "grad_norm": 1.734375, + "learning_rate": 1.757822222222222e-05, + "loss": 0.4198, + "step": 5450 + }, + { + "epoch": 0.12131176679701386, + "grad_norm": 1.84375, + "learning_rate": 1.757377777777778e-05, + "loss": 0.4673, + "step": 5460 + }, + { + "epoch": 0.12153394952008532, + "grad_norm": 1.8125, + "learning_rate": 1.7569333333333335e-05, + "loss": 0.4656, + "step": 5470 + }, + { + "epoch": 0.12175613224315678, + "grad_norm": 1.7421875, + "learning_rate": 1.7564888888888893e-05, + "loss": 0.4143, + "step": 5480 + }, + { + "epoch": 0.12197831496622823, + "grad_norm": 1.765625, + "learning_rate": 1.7560444444444448e-05, + "loss": 0.4781, + "step": 5490 + }, + { + "epoch": 0.12220049768929968, + "grad_norm": 1.5625, + "learning_rate": 1.7556000000000003e-05, + "loss": 0.4364, + "step": 5500 + }, + { + "epoch": 0.12242268041237113, + "grad_norm": 1.8125, + "learning_rate": 1.7551555555555558e-05, + "loss": 0.4422, + "step": 5510 + }, + { + "epoch": 0.1226448631354426, + "grad_norm": 1.359375, + "learning_rate": 1.7547111111111112e-05, + "loss": 0.4161, + "step": 5520 + }, + { + "epoch": 0.12286704585851405, + "grad_norm": 1.7578125, + "learning_rate": 1.7542666666666667e-05, + "loss": 0.4024, + "step": 5530 + }, + { + "epoch": 0.1230892285815855, + "grad_norm": 2.125, + "learning_rate": 1.7538222222222222e-05, + "loss": 0.4651, + "step": 5540 + }, + { + "epoch": 0.12331141130465695, + "grad_norm": 1.859375, + "learning_rate": 1.7533777777777777e-05, + "loss": 0.4519, + "step": 5550 + }, + { + "epoch": 0.1235335940277284, + "grad_norm": 1.9765625, + "learning_rate": 1.7529333333333335e-05, + "loss": 0.4236, + "step": 5560 + }, + { + "epoch": 0.12375577675079986, + "grad_norm": 2.0625, + "learning_rate": 1.752488888888889e-05, + "loss": 0.4643, + "step": 5570 + }, + { + "epoch": 0.12397795947387132, + "grad_norm": 1.8359375, + "learning_rate": 1.7520444444444448e-05, + "loss": 0.3911, + "step": 5580 + }, + { + "epoch": 0.12420014219694277, + "grad_norm": 1.84375, + "learning_rate": 1.7516000000000003e-05, + "loss": 0.4421, + "step": 5590 + }, + { + "epoch": 0.12442232492001422, + "grad_norm": 1.9921875, + "learning_rate": 1.7511555555555558e-05, + "loss": 0.4339, + "step": 5600 + }, + { + "epoch": 0.12464450764308567, + "grad_norm": 2.328125, + "learning_rate": 1.7507111111111113e-05, + "loss": 0.4642, + "step": 5610 + }, + { + "epoch": 0.12486669036615712, + "grad_norm": 1.71875, + "learning_rate": 1.7502666666666668e-05, + "loss": 0.4276, + "step": 5620 + }, + { + "epoch": 0.12508887308922859, + "grad_norm": 2.328125, + "learning_rate": 1.7498222222222222e-05, + "loss": 0.405, + "step": 5630 + }, + { + "epoch": 0.12531105581230004, + "grad_norm": 1.90625, + "learning_rate": 1.7493777777777777e-05, + "loss": 0.4389, + "step": 5640 + }, + { + "epoch": 0.1255332385353715, + "grad_norm": 1.9140625, + "learning_rate": 1.7489333333333332e-05, + "loss": 0.4203, + "step": 5650 + }, + { + "epoch": 0.12575542125844294, + "grad_norm": 1.890625, + "learning_rate": 1.748488888888889e-05, + "loss": 0.4125, + "step": 5660 + }, + { + "epoch": 0.1259776039815144, + "grad_norm": 1.546875, + "learning_rate": 1.7480444444444445e-05, + "loss": 0.4202, + "step": 5670 + }, + { + "epoch": 0.12619978670458584, + "grad_norm": 1.421875, + "learning_rate": 1.7476000000000003e-05, + "loss": 0.4097, + "step": 5680 + }, + { + "epoch": 0.1264219694276573, + "grad_norm": 2.265625, + "learning_rate": 1.7471555555555558e-05, + "loss": 0.4353, + "step": 5690 + }, + { + "epoch": 0.12664415215072877, + "grad_norm": 1.859375, + "learning_rate": 1.7467111111111113e-05, + "loss": 0.4267, + "step": 5700 + }, + { + "epoch": 0.12686633487380022, + "grad_norm": 2.171875, + "learning_rate": 1.7462666666666668e-05, + "loss": 0.4148, + "step": 5710 + }, + { + "epoch": 0.12708851759687168, + "grad_norm": 1.765625, + "learning_rate": 1.7458222222222223e-05, + "loss": 0.4694, + "step": 5720 + }, + { + "epoch": 0.12731070031994313, + "grad_norm": 1.5390625, + "learning_rate": 1.7453777777777778e-05, + "loss": 0.4575, + "step": 5730 + }, + { + "epoch": 0.12753288304301458, + "grad_norm": 1.9296875, + "learning_rate": 1.7449333333333333e-05, + "loss": 0.4206, + "step": 5740 + }, + { + "epoch": 0.12775506576608603, + "grad_norm": 2.171875, + "learning_rate": 1.744488888888889e-05, + "loss": 0.4359, + "step": 5750 + }, + { + "epoch": 0.12797724848915748, + "grad_norm": 2.25, + "learning_rate": 1.7440444444444446e-05, + "loss": 0.4633, + "step": 5760 + }, + { + "epoch": 0.12819943121222893, + "grad_norm": 1.9375, + "learning_rate": 1.7436e-05, + "loss": 0.4273, + "step": 5770 + }, + { + "epoch": 0.12842161393530038, + "grad_norm": 2.03125, + "learning_rate": 1.743155555555556e-05, + "loss": 0.4296, + "step": 5780 + }, + { + "epoch": 0.12864379665837183, + "grad_norm": 1.453125, + "learning_rate": 1.7427111111111114e-05, + "loss": 0.4253, + "step": 5790 + }, + { + "epoch": 0.12886597938144329, + "grad_norm": 2.046875, + "learning_rate": 1.742266666666667e-05, + "loss": 0.4561, + "step": 5800 + }, + { + "epoch": 0.12908816210451476, + "grad_norm": 1.6953125, + "learning_rate": 1.7418222222222223e-05, + "loss": 0.4183, + "step": 5810 + }, + { + "epoch": 0.12931034482758622, + "grad_norm": 1.8046875, + "learning_rate": 1.7413777777777778e-05, + "loss": 0.4209, + "step": 5820 + }, + { + "epoch": 0.12953252755065767, + "grad_norm": 1.9296875, + "learning_rate": 1.7409333333333336e-05, + "loss": 0.4257, + "step": 5830 + }, + { + "epoch": 0.12975471027372912, + "grad_norm": 1.8515625, + "learning_rate": 1.740488888888889e-05, + "loss": 0.4621, + "step": 5840 + }, + { + "epoch": 0.12997689299680057, + "grad_norm": 2.03125, + "learning_rate": 1.7400444444444446e-05, + "loss": 0.4583, + "step": 5850 + }, + { + "epoch": 0.13019907571987202, + "grad_norm": 2.078125, + "learning_rate": 1.7396e-05, + "loss": 0.4717, + "step": 5860 + }, + { + "epoch": 0.13042125844294347, + "grad_norm": 1.890625, + "learning_rate": 1.7391555555555556e-05, + "loss": 0.405, + "step": 5870 + }, + { + "epoch": 0.13064344116601492, + "grad_norm": 1.8515625, + "learning_rate": 1.7387111111111114e-05, + "loss": 0.4456, + "step": 5880 + }, + { + "epoch": 0.13086562388908637, + "grad_norm": 1.765625, + "learning_rate": 1.738266666666667e-05, + "loss": 0.4651, + "step": 5890 + }, + { + "epoch": 0.13108780661215783, + "grad_norm": 1.5703125, + "learning_rate": 1.7378222222222224e-05, + "loss": 0.3938, + "step": 5900 + }, + { + "epoch": 0.1313099893352293, + "grad_norm": 1.8046875, + "learning_rate": 1.737377777777778e-05, + "loss": 0.4213, + "step": 5910 + }, + { + "epoch": 0.13153217205830076, + "grad_norm": 1.6875, + "learning_rate": 1.7369333333333337e-05, + "loss": 0.438, + "step": 5920 + }, + { + "epoch": 0.1317543547813722, + "grad_norm": 1.890625, + "learning_rate": 1.736488888888889e-05, + "loss": 0.4435, + "step": 5930 + }, + { + "epoch": 0.13197653750444366, + "grad_norm": 1.828125, + "learning_rate": 1.7360444444444446e-05, + "loss": 0.4096, + "step": 5940 + }, + { + "epoch": 0.1321987202275151, + "grad_norm": 1.75, + "learning_rate": 1.7356e-05, + "loss": 0.4139, + "step": 5950 + }, + { + "epoch": 0.13242090295058656, + "grad_norm": 1.953125, + "learning_rate": 1.7351555555555556e-05, + "loss": 0.3856, + "step": 5960 + }, + { + "epoch": 0.132643085673658, + "grad_norm": 1.625, + "learning_rate": 1.734711111111111e-05, + "loss": 0.4187, + "step": 5970 + }, + { + "epoch": 0.13286526839672946, + "grad_norm": 1.9453125, + "learning_rate": 1.734266666666667e-05, + "loss": 0.4594, + "step": 5980 + }, + { + "epoch": 0.13308745111980091, + "grad_norm": 2.03125, + "learning_rate": 1.7338222222222224e-05, + "loss": 0.4173, + "step": 5990 + }, + { + "epoch": 0.13330963384287237, + "grad_norm": 1.7578125, + "learning_rate": 1.733377777777778e-05, + "loss": 0.4522, + "step": 6000 + }, + { + "epoch": 0.13353181656594384, + "grad_norm": 1.8125, + "learning_rate": 1.7329333333333337e-05, + "loss": 0.421, + "step": 6010 + }, + { + "epoch": 0.1337539992890153, + "grad_norm": 2.328125, + "learning_rate": 1.7324888888888892e-05, + "loss": 0.4665, + "step": 6020 + }, + { + "epoch": 0.13397618201208675, + "grad_norm": 1.890625, + "learning_rate": 1.7320444444444447e-05, + "loss": 0.4417, + "step": 6030 + }, + { + "epoch": 0.1341983647351582, + "grad_norm": 1.7734375, + "learning_rate": 1.7316e-05, + "loss": 0.3872, + "step": 6040 + }, + { + "epoch": 0.13442054745822965, + "grad_norm": 1.921875, + "learning_rate": 1.7311555555555556e-05, + "loss": 0.4432, + "step": 6050 + }, + { + "epoch": 0.1346427301813011, + "grad_norm": 2.0625, + "learning_rate": 1.730711111111111e-05, + "loss": 0.4623, + "step": 6060 + }, + { + "epoch": 0.13486491290437255, + "grad_norm": 2.109375, + "learning_rate": 1.7302666666666666e-05, + "loss": 0.4505, + "step": 6070 + }, + { + "epoch": 0.135087095627444, + "grad_norm": 1.8203125, + "learning_rate": 1.7298222222222224e-05, + "loss": 0.4451, + "step": 6080 + }, + { + "epoch": 0.13530927835051546, + "grad_norm": 1.6796875, + "learning_rate": 1.729377777777778e-05, + "loss": 0.4378, + "step": 6090 + }, + { + "epoch": 0.1355314610735869, + "grad_norm": 2.015625, + "learning_rate": 1.7289333333333337e-05, + "loss": 0.4491, + "step": 6100 + }, + { + "epoch": 0.13575364379665839, + "grad_norm": 2.03125, + "learning_rate": 1.7284888888888892e-05, + "loss": 0.421, + "step": 6110 + }, + { + "epoch": 0.13597582651972984, + "grad_norm": 2.09375, + "learning_rate": 1.7280444444444447e-05, + "loss": 0.4395, + "step": 6120 + }, + { + "epoch": 0.1361980092428013, + "grad_norm": 1.640625, + "learning_rate": 1.7276000000000002e-05, + "loss": 0.4214, + "step": 6130 + }, + { + "epoch": 0.13642019196587274, + "grad_norm": 1.6796875, + "learning_rate": 1.7271555555555557e-05, + "loss": 0.4264, + "step": 6140 + }, + { + "epoch": 0.1366423746889442, + "grad_norm": 1.546875, + "learning_rate": 1.726711111111111e-05, + "loss": 0.3855, + "step": 6150 + }, + { + "epoch": 0.13686455741201564, + "grad_norm": 1.8203125, + "learning_rate": 1.7262666666666667e-05, + "loss": 0.4683, + "step": 6160 + }, + { + "epoch": 0.1370867401350871, + "grad_norm": 1.9296875, + "learning_rate": 1.725822222222222e-05, + "loss": 0.4176, + "step": 6170 + }, + { + "epoch": 0.13730892285815854, + "grad_norm": 1.9453125, + "learning_rate": 1.725377777777778e-05, + "loss": 0.4196, + "step": 6180 + }, + { + "epoch": 0.13753110558123, + "grad_norm": 1.765625, + "learning_rate": 1.7249333333333334e-05, + "loss": 0.4262, + "step": 6190 + }, + { + "epoch": 0.13775328830430145, + "grad_norm": 2.265625, + "learning_rate": 1.7244888888888893e-05, + "loss": 0.4148, + "step": 6200 + }, + { + "epoch": 0.1379754710273729, + "grad_norm": 1.7734375, + "learning_rate": 1.7240444444444448e-05, + "loss": 0.398, + "step": 6210 + }, + { + "epoch": 0.13819765375044438, + "grad_norm": 1.90625, + "learning_rate": 1.7236000000000002e-05, + "loss": 0.425, + "step": 6220 + }, + { + "epoch": 0.13841983647351583, + "grad_norm": 1.78125, + "learning_rate": 1.7231555555555557e-05, + "loss": 0.4279, + "step": 6230 + }, + { + "epoch": 0.13864201919658728, + "grad_norm": 1.921875, + "learning_rate": 1.7227111111111112e-05, + "loss": 0.4503, + "step": 6240 + }, + { + "epoch": 0.13886420191965873, + "grad_norm": 1.890625, + "learning_rate": 1.7222666666666667e-05, + "loss": 0.445, + "step": 6250 + }, + { + "epoch": 0.13908638464273018, + "grad_norm": 1.8125, + "learning_rate": 1.7218222222222222e-05, + "loss": 0.4387, + "step": 6260 + }, + { + "epoch": 0.13930856736580163, + "grad_norm": 1.9140625, + "learning_rate": 1.7213777777777777e-05, + "loss": 0.4213, + "step": 6270 + }, + { + "epoch": 0.13953075008887308, + "grad_norm": 1.7265625, + "learning_rate": 1.7209333333333335e-05, + "loss": 0.4127, + "step": 6280 + }, + { + "epoch": 0.13975293281194454, + "grad_norm": 1.9140625, + "learning_rate": 1.720488888888889e-05, + "loss": 0.4115, + "step": 6290 + }, + { + "epoch": 0.139975115535016, + "grad_norm": 1.96875, + "learning_rate": 1.7200444444444448e-05, + "loss": 0.4444, + "step": 6300 + }, + { + "epoch": 0.14019729825808744, + "grad_norm": 1.953125, + "learning_rate": 1.7196000000000003e-05, + "loss": 0.4442, + "step": 6310 + }, + { + "epoch": 0.14041948098115892, + "grad_norm": 1.90625, + "learning_rate": 1.7191555555555558e-05, + "loss": 0.4216, + "step": 6320 + }, + { + "epoch": 0.14064166370423037, + "grad_norm": 2.078125, + "learning_rate": 1.7187111111111112e-05, + "loss": 0.4539, + "step": 6330 + }, + { + "epoch": 0.14086384642730182, + "grad_norm": 1.890625, + "learning_rate": 1.7182666666666667e-05, + "loss": 0.4116, + "step": 6340 + }, + { + "epoch": 0.14108602915037327, + "grad_norm": 2.234375, + "learning_rate": 1.7178222222222222e-05, + "loss": 0.4251, + "step": 6350 + }, + { + "epoch": 0.14130821187344472, + "grad_norm": 1.6171875, + "learning_rate": 1.7173777777777777e-05, + "loss": 0.4355, + "step": 6360 + }, + { + "epoch": 0.14153039459651617, + "grad_norm": 2.0, + "learning_rate": 1.7169333333333335e-05, + "loss": 0.409, + "step": 6370 + }, + { + "epoch": 0.14175257731958762, + "grad_norm": 1.859375, + "learning_rate": 1.716488888888889e-05, + "loss": 0.4058, + "step": 6380 + }, + { + "epoch": 0.14197476004265908, + "grad_norm": 1.703125, + "learning_rate": 1.7160444444444445e-05, + "loss": 0.4102, + "step": 6390 + }, + { + "epoch": 0.14219694276573053, + "grad_norm": 2.203125, + "learning_rate": 1.7156000000000003e-05, + "loss": 0.4403, + "step": 6400 + }, + { + "epoch": 0.14241912548880198, + "grad_norm": 1.8046875, + "learning_rate": 1.7151555555555558e-05, + "loss": 0.4555, + "step": 6410 + }, + { + "epoch": 0.14264130821187346, + "grad_norm": 1.7734375, + "learning_rate": 1.7147111111111113e-05, + "loss": 0.4371, + "step": 6420 + }, + { + "epoch": 0.1428634909349449, + "grad_norm": 1.8359375, + "learning_rate": 1.7142666666666668e-05, + "loss": 0.4506, + "step": 6430 + }, + { + "epoch": 0.14308567365801636, + "grad_norm": 1.640625, + "learning_rate": 1.7138222222222222e-05, + "loss": 0.402, + "step": 6440 + }, + { + "epoch": 0.1433078563810878, + "grad_norm": 1.9765625, + "learning_rate": 1.713377777777778e-05, + "loss": 0.4712, + "step": 6450 + }, + { + "epoch": 0.14353003910415926, + "grad_norm": 1.90625, + "learning_rate": 1.7129333333333336e-05, + "loss": 0.4108, + "step": 6460 + }, + { + "epoch": 0.14375222182723071, + "grad_norm": 1.890625, + "learning_rate": 1.712488888888889e-05, + "loss": 0.4138, + "step": 6470 + }, + { + "epoch": 0.14397440455030217, + "grad_norm": 2.046875, + "learning_rate": 1.7120444444444445e-05, + "loss": 0.4218, + "step": 6480 + }, + { + "epoch": 0.14419658727337362, + "grad_norm": 1.78125, + "learning_rate": 1.7116e-05, + "loss": 0.3997, + "step": 6490 + }, + { + "epoch": 0.14441876999644507, + "grad_norm": 2.171875, + "learning_rate": 1.711155555555556e-05, + "loss": 0.3893, + "step": 6500 + }, + { + "epoch": 0.14464095271951652, + "grad_norm": 2.015625, + "learning_rate": 1.7107111111111113e-05, + "loss": 0.453, + "step": 6510 + }, + { + "epoch": 0.144863135442588, + "grad_norm": 1.5390625, + "learning_rate": 1.7102666666666668e-05, + "loss": 0.4047, + "step": 6520 + }, + { + "epoch": 0.14508531816565945, + "grad_norm": 1.9609375, + "learning_rate": 1.7098222222222223e-05, + "loss": 0.3939, + "step": 6530 + }, + { + "epoch": 0.1453075008887309, + "grad_norm": 1.9921875, + "learning_rate": 1.709377777777778e-05, + "loss": 0.4508, + "step": 6540 + }, + { + "epoch": 0.14552968361180235, + "grad_norm": 1.5625, + "learning_rate": 1.7089333333333336e-05, + "loss": 0.429, + "step": 6550 + }, + { + "epoch": 0.1457518663348738, + "grad_norm": 2.078125, + "learning_rate": 1.708488888888889e-05, + "loss": 0.4291, + "step": 6560 + }, + { + "epoch": 0.14597404905794525, + "grad_norm": 2.109375, + "learning_rate": 1.7080444444444446e-05, + "loss": 0.4572, + "step": 6570 + }, + { + "epoch": 0.1461962317810167, + "grad_norm": 1.90625, + "learning_rate": 1.7076e-05, + "loss": 0.4417, + "step": 6580 + }, + { + "epoch": 0.14641841450408816, + "grad_norm": 1.984375, + "learning_rate": 1.7071555555555555e-05, + "loss": 0.4619, + "step": 6590 + }, + { + "epoch": 0.1466405972271596, + "grad_norm": 2.234375, + "learning_rate": 1.7067111111111114e-05, + "loss": 0.4344, + "step": 6600 + }, + { + "epoch": 0.14686277995023106, + "grad_norm": 1.71875, + "learning_rate": 1.706266666666667e-05, + "loss": 0.4255, + "step": 6610 + }, + { + "epoch": 0.1470849626733025, + "grad_norm": 1.953125, + "learning_rate": 1.7058222222222223e-05, + "loss": 0.4519, + "step": 6620 + }, + { + "epoch": 0.147307145396374, + "grad_norm": 2.125, + "learning_rate": 1.705377777777778e-05, + "loss": 0.4239, + "step": 6630 + }, + { + "epoch": 0.14752932811944544, + "grad_norm": 2.09375, + "learning_rate": 1.7049333333333336e-05, + "loss": 0.4371, + "step": 6640 + }, + { + "epoch": 0.1477515108425169, + "grad_norm": 1.9375, + "learning_rate": 1.704488888888889e-05, + "loss": 0.4028, + "step": 6650 + }, + { + "epoch": 0.14797369356558834, + "grad_norm": 2.296875, + "learning_rate": 1.7040444444444446e-05, + "loss": 0.4049, + "step": 6660 + }, + { + "epoch": 0.1481958762886598, + "grad_norm": 1.6875, + "learning_rate": 1.7036e-05, + "loss": 0.4177, + "step": 6670 + }, + { + "epoch": 0.14841805901173125, + "grad_norm": 2.046875, + "learning_rate": 1.7031555555555556e-05, + "loss": 0.4601, + "step": 6680 + }, + { + "epoch": 0.1486402417348027, + "grad_norm": 1.6875, + "learning_rate": 1.702711111111111e-05, + "loss": 0.4334, + "step": 6690 + }, + { + "epoch": 0.14886242445787415, + "grad_norm": 2.0625, + "learning_rate": 1.702266666666667e-05, + "loss": 0.4312, + "step": 6700 + }, + { + "epoch": 0.1490846071809456, + "grad_norm": 1.859375, + "learning_rate": 1.7018222222222224e-05, + "loss": 0.4537, + "step": 6710 + }, + { + "epoch": 0.14930678990401705, + "grad_norm": 1.6953125, + "learning_rate": 1.701377777777778e-05, + "loss": 0.3942, + "step": 6720 + }, + { + "epoch": 0.14952897262708853, + "grad_norm": 2.109375, + "learning_rate": 1.7009333333333337e-05, + "loss": 0.4384, + "step": 6730 + }, + { + "epoch": 0.14975115535015998, + "grad_norm": 1.96875, + "learning_rate": 1.700488888888889e-05, + "loss": 0.4478, + "step": 6740 + }, + { + "epoch": 0.14997333807323143, + "grad_norm": 1.640625, + "learning_rate": 1.7000444444444446e-05, + "loss": 0.4289, + "step": 6750 + }, + { + "epoch": 0.15019552079630288, + "grad_norm": 2.125, + "learning_rate": 1.6996e-05, + "loss": 0.4442, + "step": 6760 + }, + { + "epoch": 0.15041770351937434, + "grad_norm": 2.0625, + "learning_rate": 1.6991555555555556e-05, + "loss": 0.4368, + "step": 6770 + }, + { + "epoch": 0.1506398862424458, + "grad_norm": 2.03125, + "learning_rate": 1.698711111111111e-05, + "loss": 0.447, + "step": 6780 + }, + { + "epoch": 0.15086206896551724, + "grad_norm": 1.625, + "learning_rate": 1.6982666666666666e-05, + "loss": 0.4368, + "step": 6790 + }, + { + "epoch": 0.1510842516885887, + "grad_norm": 2.0625, + "learning_rate": 1.6978222222222224e-05, + "loss": 0.4489, + "step": 6800 + }, + { + "epoch": 0.15130643441166014, + "grad_norm": 2.171875, + "learning_rate": 1.697377777777778e-05, + "loss": 0.4006, + "step": 6810 + }, + { + "epoch": 0.1515286171347316, + "grad_norm": 1.8515625, + "learning_rate": 1.6969333333333337e-05, + "loss": 0.4404, + "step": 6820 + }, + { + "epoch": 0.15175079985780307, + "grad_norm": 2.015625, + "learning_rate": 1.6964888888888892e-05, + "loss": 0.4283, + "step": 6830 + }, + { + "epoch": 0.15197298258087452, + "grad_norm": 1.84375, + "learning_rate": 1.6960444444444447e-05, + "loss": 0.4502, + "step": 6840 + }, + { + "epoch": 0.15219516530394597, + "grad_norm": 1.703125, + "learning_rate": 1.6956e-05, + "loss": 0.3995, + "step": 6850 + }, + { + "epoch": 0.15241734802701742, + "grad_norm": 2.0, + "learning_rate": 1.6951555555555556e-05, + "loss": 0.4811, + "step": 6860 + }, + { + "epoch": 0.15263953075008888, + "grad_norm": 1.9609375, + "learning_rate": 1.694711111111111e-05, + "loss": 0.4137, + "step": 6870 + }, + { + "epoch": 0.15286171347316033, + "grad_norm": 1.6875, + "learning_rate": 1.6942666666666666e-05, + "loss": 0.4289, + "step": 6880 + }, + { + "epoch": 0.15308389619623178, + "grad_norm": 1.4921875, + "learning_rate": 1.693822222222222e-05, + "loss": 0.4144, + "step": 6890 + }, + { + "epoch": 0.15330607891930323, + "grad_norm": 2.328125, + "learning_rate": 1.693377777777778e-05, + "loss": 0.4346, + "step": 6900 + }, + { + "epoch": 0.15352826164237468, + "grad_norm": 1.6796875, + "learning_rate": 1.6929333333333334e-05, + "loss": 0.3889, + "step": 6910 + }, + { + "epoch": 0.15375044436544613, + "grad_norm": 2.140625, + "learning_rate": 1.6924888888888892e-05, + "loss": 0.4404, + "step": 6920 + }, + { + "epoch": 0.1539726270885176, + "grad_norm": 2.0625, + "learning_rate": 1.6920444444444447e-05, + "loss": 0.4582, + "step": 6930 + }, + { + "epoch": 0.15419480981158906, + "grad_norm": 1.9609375, + "learning_rate": 1.6916000000000002e-05, + "loss": 0.4138, + "step": 6940 + }, + { + "epoch": 0.1544169925346605, + "grad_norm": 2.21875, + "learning_rate": 1.6911555555555557e-05, + "loss": 0.4371, + "step": 6950 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 1.8671875, + "learning_rate": 1.6907111111111112e-05, + "loss": 0.4283, + "step": 6960 + }, + { + "epoch": 0.15486135798080342, + "grad_norm": 2.234375, + "learning_rate": 1.6902666666666667e-05, + "loss": 0.4166, + "step": 6970 + }, + { + "epoch": 0.15508354070387487, + "grad_norm": 1.9765625, + "learning_rate": 1.689822222222222e-05, + "loss": 0.4245, + "step": 6980 + }, + { + "epoch": 0.15530572342694632, + "grad_norm": 1.90625, + "learning_rate": 1.689377777777778e-05, + "loss": 0.4454, + "step": 6990 + }, + { + "epoch": 0.15552790615001777, + "grad_norm": 1.8671875, + "learning_rate": 1.6889333333333334e-05, + "loss": 0.4286, + "step": 7000 + }, + { + "epoch": 0.15575008887308922, + "grad_norm": 2.171875, + "learning_rate": 1.688488888888889e-05, + "loss": 0.429, + "step": 7010 + }, + { + "epoch": 0.15597227159616067, + "grad_norm": 2.109375, + "learning_rate": 1.6880444444444448e-05, + "loss": 0.4676, + "step": 7020 + }, + { + "epoch": 0.15619445431923212, + "grad_norm": 2.125, + "learning_rate": 1.6876000000000002e-05, + "loss": 0.4476, + "step": 7030 + }, + { + "epoch": 0.1564166370423036, + "grad_norm": 2.03125, + "learning_rate": 1.6871555555555557e-05, + "loss": 0.4274, + "step": 7040 + }, + { + "epoch": 0.15663881976537505, + "grad_norm": 2.046875, + "learning_rate": 1.6867111111111112e-05, + "loss": 0.45, + "step": 7050 + }, + { + "epoch": 0.1568610024884465, + "grad_norm": 1.78125, + "learning_rate": 1.6862666666666667e-05, + "loss": 0.3895, + "step": 7060 + }, + { + "epoch": 0.15708318521151796, + "grad_norm": 1.859375, + "learning_rate": 1.6858222222222225e-05, + "loss": 0.4266, + "step": 7070 + }, + { + "epoch": 0.1573053679345894, + "grad_norm": 1.9765625, + "learning_rate": 1.685377777777778e-05, + "loss": 0.41, + "step": 7080 + }, + { + "epoch": 0.15752755065766086, + "grad_norm": 1.5859375, + "learning_rate": 1.6849333333333335e-05, + "loss": 0.4325, + "step": 7090 + }, + { + "epoch": 0.1577497333807323, + "grad_norm": 1.75, + "learning_rate": 1.684488888888889e-05, + "loss": 0.4437, + "step": 7100 + }, + { + "epoch": 0.15797191610380376, + "grad_norm": 1.9609375, + "learning_rate": 1.6840444444444445e-05, + "loss": 0.4044, + "step": 7110 + }, + { + "epoch": 0.1581940988268752, + "grad_norm": 2.0, + "learning_rate": 1.6836000000000003e-05, + "loss": 0.4015, + "step": 7120 + }, + { + "epoch": 0.15841628154994666, + "grad_norm": 1.8984375, + "learning_rate": 1.6831555555555558e-05, + "loss": 0.4318, + "step": 7130 + }, + { + "epoch": 0.15863846427301814, + "grad_norm": 1.90625, + "learning_rate": 1.6827111111111112e-05, + "loss": 0.4308, + "step": 7140 + }, + { + "epoch": 0.1588606469960896, + "grad_norm": 2.140625, + "learning_rate": 1.6822666666666667e-05, + "loss": 0.4395, + "step": 7150 + }, + { + "epoch": 0.15908282971916105, + "grad_norm": 2.328125, + "learning_rate": 1.6818222222222226e-05, + "loss": 0.4215, + "step": 7160 + }, + { + "epoch": 0.1593050124422325, + "grad_norm": 2.125, + "learning_rate": 1.681377777777778e-05, + "loss": 0.4215, + "step": 7170 + }, + { + "epoch": 0.15952719516530395, + "grad_norm": 2.015625, + "learning_rate": 1.6809333333333335e-05, + "loss": 0.4018, + "step": 7180 + }, + { + "epoch": 0.1597493778883754, + "grad_norm": 1.5859375, + "learning_rate": 1.680488888888889e-05, + "loss": 0.4546, + "step": 7190 + }, + { + "epoch": 0.15997156061144685, + "grad_norm": 1.875, + "learning_rate": 1.6800444444444445e-05, + "loss": 0.4275, + "step": 7200 + }, + { + "epoch": 0.1601937433345183, + "grad_norm": 1.9453125, + "learning_rate": 1.6796e-05, + "loss": 0.4418, + "step": 7210 + }, + { + "epoch": 0.16041592605758975, + "grad_norm": 1.859375, + "learning_rate": 1.6791555555555558e-05, + "loss": 0.3983, + "step": 7220 + }, + { + "epoch": 0.1606381087806612, + "grad_norm": 1.875, + "learning_rate": 1.6787111111111113e-05, + "loss": 0.4474, + "step": 7230 + }, + { + "epoch": 0.16086029150373268, + "grad_norm": 1.71875, + "learning_rate": 1.6782666666666668e-05, + "loss": 0.4463, + "step": 7240 + }, + { + "epoch": 0.16108247422680413, + "grad_norm": 1.9296875, + "learning_rate": 1.6778222222222226e-05, + "loss": 0.4474, + "step": 7250 + }, + { + "epoch": 0.16130465694987559, + "grad_norm": 2.265625, + "learning_rate": 1.677377777777778e-05, + "loss": 0.4243, + "step": 7260 + }, + { + "epoch": 0.16152683967294704, + "grad_norm": 2.046875, + "learning_rate": 1.6769333333333336e-05, + "loss": 0.4301, + "step": 7270 + }, + { + "epoch": 0.1617490223960185, + "grad_norm": 1.8515625, + "learning_rate": 1.676488888888889e-05, + "loss": 0.4086, + "step": 7280 + }, + { + "epoch": 0.16197120511908994, + "grad_norm": 1.6171875, + "learning_rate": 1.6760444444444445e-05, + "loss": 0.4076, + "step": 7290 + }, + { + "epoch": 0.1621933878421614, + "grad_norm": 1.9765625, + "learning_rate": 1.6756e-05, + "loss": 0.4533, + "step": 7300 + }, + { + "epoch": 0.16241557056523284, + "grad_norm": 2.078125, + "learning_rate": 1.6751555555555555e-05, + "loss": 0.3845, + "step": 7310 + }, + { + "epoch": 0.1626377532883043, + "grad_norm": 1.9921875, + "learning_rate": 1.6747111111111113e-05, + "loss": 0.3745, + "step": 7320 + }, + { + "epoch": 0.16285993601137574, + "grad_norm": 1.875, + "learning_rate": 1.6742666666666668e-05, + "loss": 0.3866, + "step": 7330 + }, + { + "epoch": 0.1630821187344472, + "grad_norm": 2.046875, + "learning_rate": 1.6738222222222223e-05, + "loss": 0.447, + "step": 7340 + }, + { + "epoch": 0.16330430145751867, + "grad_norm": 2.28125, + "learning_rate": 1.673377777777778e-05, + "loss": 0.4418, + "step": 7350 + }, + { + "epoch": 0.16352648418059013, + "grad_norm": 2.0625, + "learning_rate": 1.6729333333333336e-05, + "loss": 0.3611, + "step": 7360 + }, + { + "epoch": 0.16374866690366158, + "grad_norm": 1.828125, + "learning_rate": 1.672488888888889e-05, + "loss": 0.4063, + "step": 7370 + }, + { + "epoch": 0.16397084962673303, + "grad_norm": 2.28125, + "learning_rate": 1.6720444444444446e-05, + "loss": 0.4245, + "step": 7380 + }, + { + "epoch": 0.16419303234980448, + "grad_norm": 1.9296875, + "learning_rate": 1.6716e-05, + "loss": 0.434, + "step": 7390 + }, + { + "epoch": 0.16441521507287593, + "grad_norm": 2.234375, + "learning_rate": 1.6711555555555555e-05, + "loss": 0.3997, + "step": 7400 + }, + { + "epoch": 0.16463739779594738, + "grad_norm": 2.1875, + "learning_rate": 1.670711111111111e-05, + "loss": 0.4215, + "step": 7410 + }, + { + "epoch": 0.16485958051901883, + "grad_norm": 2.203125, + "learning_rate": 1.670266666666667e-05, + "loss": 0.4563, + "step": 7420 + }, + { + "epoch": 0.16508176324209028, + "grad_norm": 2.421875, + "learning_rate": 1.6698222222222223e-05, + "loss": 0.4351, + "step": 7430 + }, + { + "epoch": 0.16530394596516174, + "grad_norm": 1.828125, + "learning_rate": 1.6693777777777778e-05, + "loss": 0.4374, + "step": 7440 + }, + { + "epoch": 0.16552612868823322, + "grad_norm": 2.03125, + "learning_rate": 1.6689333333333336e-05, + "loss": 0.4122, + "step": 7450 + }, + { + "epoch": 0.16574831141130467, + "grad_norm": 2.3125, + "learning_rate": 1.668488888888889e-05, + "loss": 0.4547, + "step": 7460 + }, + { + "epoch": 0.16597049413437612, + "grad_norm": 2.171875, + "learning_rate": 1.6680444444444446e-05, + "loss": 0.4414, + "step": 7470 + }, + { + "epoch": 0.16619267685744757, + "grad_norm": 1.71875, + "learning_rate": 1.6676e-05, + "loss": 0.4345, + "step": 7480 + }, + { + "epoch": 0.16641485958051902, + "grad_norm": 2.03125, + "learning_rate": 1.6671555555555556e-05, + "loss": 0.4033, + "step": 7490 + }, + { + "epoch": 0.16663704230359047, + "grad_norm": 1.9921875, + "learning_rate": 1.666711111111111e-05, + "loss": 0.4159, + "step": 7500 + }, + { + "epoch": 0.16685922502666192, + "grad_norm": 2.75, + "learning_rate": 1.6662666666666665e-05, + "loss": 0.4305, + "step": 7510 + }, + { + "epoch": 0.16708140774973337, + "grad_norm": 2.328125, + "learning_rate": 1.6658222222222224e-05, + "loss": 0.4756, + "step": 7520 + }, + { + "epoch": 0.16730359047280483, + "grad_norm": 1.8046875, + "learning_rate": 1.665377777777778e-05, + "loss": 0.4448, + "step": 7530 + }, + { + "epoch": 0.16752577319587628, + "grad_norm": 1.7421875, + "learning_rate": 1.6649333333333337e-05, + "loss": 0.3996, + "step": 7540 + }, + { + "epoch": 0.16774795591894776, + "grad_norm": 1.8515625, + "learning_rate": 1.664488888888889e-05, + "loss": 0.3974, + "step": 7550 + }, + { + "epoch": 0.1679701386420192, + "grad_norm": 1.6875, + "learning_rate": 1.6640444444444446e-05, + "loss": 0.4326, + "step": 7560 + }, + { + "epoch": 0.16819232136509066, + "grad_norm": 1.5859375, + "learning_rate": 1.6636e-05, + "loss": 0.4018, + "step": 7570 + }, + { + "epoch": 0.1684145040881621, + "grad_norm": 2.234375, + "learning_rate": 1.6631555555555556e-05, + "loss": 0.4749, + "step": 7580 + }, + { + "epoch": 0.16863668681123356, + "grad_norm": 1.65625, + "learning_rate": 1.662711111111111e-05, + "loss": 0.4123, + "step": 7590 + }, + { + "epoch": 0.168858869534305, + "grad_norm": 1.78125, + "learning_rate": 1.6622666666666666e-05, + "loss": 0.4242, + "step": 7600 + }, + { + "epoch": 0.16908105225737646, + "grad_norm": 1.6328125, + "learning_rate": 1.6618222222222224e-05, + "loss": 0.4025, + "step": 7610 + }, + { + "epoch": 0.16930323498044791, + "grad_norm": 2.296875, + "learning_rate": 1.661377777777778e-05, + "loss": 0.3951, + "step": 7620 + }, + { + "epoch": 0.16952541770351937, + "grad_norm": 1.6796875, + "learning_rate": 1.6609333333333334e-05, + "loss": 0.4199, + "step": 7630 + }, + { + "epoch": 0.16974760042659082, + "grad_norm": 1.65625, + "learning_rate": 1.6604888888888892e-05, + "loss": 0.4041, + "step": 7640 + }, + { + "epoch": 0.1699697831496623, + "grad_norm": 1.859375, + "learning_rate": 1.6600444444444447e-05, + "loss": 0.4304, + "step": 7650 + }, + { + "epoch": 0.17019196587273375, + "grad_norm": 1.9140625, + "learning_rate": 1.6596000000000002e-05, + "loss": 0.4008, + "step": 7660 + }, + { + "epoch": 0.1704141485958052, + "grad_norm": 1.8515625, + "learning_rate": 1.6591555555555557e-05, + "loss": 0.3829, + "step": 7670 + }, + { + "epoch": 0.17063633131887665, + "grad_norm": 2.09375, + "learning_rate": 1.658711111111111e-05, + "loss": 0.4511, + "step": 7680 + }, + { + "epoch": 0.1708585140419481, + "grad_norm": 1.71875, + "learning_rate": 1.658266666666667e-05, + "loss": 0.4274, + "step": 7690 + }, + { + "epoch": 0.17108069676501955, + "grad_norm": 1.84375, + "learning_rate": 1.6578222222222224e-05, + "loss": 0.426, + "step": 7700 + }, + { + "epoch": 0.171302879488091, + "grad_norm": 2.046875, + "learning_rate": 1.657377777777778e-05, + "loss": 0.4066, + "step": 7710 + }, + { + "epoch": 0.17152506221116245, + "grad_norm": 1.703125, + "learning_rate": 1.6569333333333334e-05, + "loss": 0.412, + "step": 7720 + }, + { + "epoch": 0.1717472449342339, + "grad_norm": 2.1875, + "learning_rate": 1.656488888888889e-05, + "loss": 0.4298, + "step": 7730 + }, + { + "epoch": 0.17196942765730536, + "grad_norm": 1.9921875, + "learning_rate": 1.6560444444444447e-05, + "loss": 0.4233, + "step": 7740 + }, + { + "epoch": 0.1721916103803768, + "grad_norm": 1.8046875, + "learning_rate": 1.6556000000000002e-05, + "loss": 0.4258, + "step": 7750 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.15625, + "learning_rate": 1.6551555555555557e-05, + "loss": 0.4082, + "step": 7760 + }, + { + "epoch": 0.17263597582651974, + "grad_norm": 1.5, + "learning_rate": 1.6547111111111112e-05, + "loss": 0.4062, + "step": 7770 + }, + { + "epoch": 0.1728581585495912, + "grad_norm": 1.65625, + "learning_rate": 1.654266666666667e-05, + "loss": 0.397, + "step": 7780 + }, + { + "epoch": 0.17308034127266264, + "grad_norm": 2.375, + "learning_rate": 1.6538222222222225e-05, + "loss": 0.4277, + "step": 7790 + }, + { + "epoch": 0.1733025239957341, + "grad_norm": 1.7109375, + "learning_rate": 1.653377777777778e-05, + "loss": 0.4284, + "step": 7800 + }, + { + "epoch": 0.17352470671880554, + "grad_norm": 1.8515625, + "learning_rate": 1.6529333333333335e-05, + "loss": 0.4554, + "step": 7810 + }, + { + "epoch": 0.173746889441877, + "grad_norm": 2.046875, + "learning_rate": 1.652488888888889e-05, + "loss": 0.4404, + "step": 7820 + }, + { + "epoch": 0.17396907216494845, + "grad_norm": 2.21875, + "learning_rate": 1.6520444444444444e-05, + "loss": 0.4119, + "step": 7830 + }, + { + "epoch": 0.1741912548880199, + "grad_norm": 2.359375, + "learning_rate": 1.6516000000000002e-05, + "loss": 0.4272, + "step": 7840 + }, + { + "epoch": 0.17441343761109135, + "grad_norm": 2.515625, + "learning_rate": 1.6511555555555557e-05, + "loss": 0.4163, + "step": 7850 + }, + { + "epoch": 0.17463562033416283, + "grad_norm": 1.75, + "learning_rate": 1.6507111111111112e-05, + "loss": 0.4186, + "step": 7860 + }, + { + "epoch": 0.17485780305723428, + "grad_norm": 2.015625, + "learning_rate": 1.650266666666667e-05, + "loss": 0.438, + "step": 7870 + }, + { + "epoch": 0.17507998578030573, + "grad_norm": 1.765625, + "learning_rate": 1.6498222222222225e-05, + "loss": 0.4119, + "step": 7880 + }, + { + "epoch": 0.17530216850337718, + "grad_norm": 2.109375, + "learning_rate": 1.649377777777778e-05, + "loss": 0.4031, + "step": 7890 + }, + { + "epoch": 0.17552435122644863, + "grad_norm": 2.140625, + "learning_rate": 1.6489333333333335e-05, + "loss": 0.4007, + "step": 7900 + }, + { + "epoch": 0.17574653394952008, + "grad_norm": 1.8984375, + "learning_rate": 1.648488888888889e-05, + "loss": 0.4203, + "step": 7910 + }, + { + "epoch": 0.17596871667259154, + "grad_norm": 1.84375, + "learning_rate": 1.6480444444444445e-05, + "loss": 0.4436, + "step": 7920 + }, + { + "epoch": 0.176190899395663, + "grad_norm": 2.078125, + "learning_rate": 1.6476e-05, + "loss": 0.415, + "step": 7930 + }, + { + "epoch": 0.17641308211873444, + "grad_norm": 2.515625, + "learning_rate": 1.6471555555555558e-05, + "loss": 0.4047, + "step": 7940 + }, + { + "epoch": 0.1766352648418059, + "grad_norm": 2.125, + "learning_rate": 1.6467111111111113e-05, + "loss": 0.4092, + "step": 7950 + }, + { + "epoch": 0.17685744756487737, + "grad_norm": 1.90625, + "learning_rate": 1.6462666666666667e-05, + "loss": 0.3885, + "step": 7960 + }, + { + "epoch": 0.17707963028794882, + "grad_norm": 1.734375, + "learning_rate": 1.6458222222222226e-05, + "loss": 0.424, + "step": 7970 + }, + { + "epoch": 0.17730181301102027, + "grad_norm": 1.5703125, + "learning_rate": 1.645377777777778e-05, + "loss": 0.3977, + "step": 7980 + }, + { + "epoch": 0.17752399573409172, + "grad_norm": 2.078125, + "learning_rate": 1.6449333333333335e-05, + "loss": 0.417, + "step": 7990 + }, + { + "epoch": 0.17774617845716317, + "grad_norm": 1.578125, + "learning_rate": 1.644488888888889e-05, + "loss": 0.3975, + "step": 8000 + }, + { + "epoch": 0.17796836118023462, + "grad_norm": 1.890625, + "learning_rate": 1.6440444444444445e-05, + "loss": 0.3919, + "step": 8010 + }, + { + "epoch": 0.17819054390330608, + "grad_norm": 2.046875, + "learning_rate": 1.6436e-05, + "loss": 0.4242, + "step": 8020 + }, + { + "epoch": 0.17841272662637753, + "grad_norm": 2.1875, + "learning_rate": 1.6431555555555555e-05, + "loss": 0.4207, + "step": 8030 + }, + { + "epoch": 0.17863490934944898, + "grad_norm": 1.859375, + "learning_rate": 1.6427111111111113e-05, + "loss": 0.3976, + "step": 8040 + }, + { + "epoch": 0.17885709207252043, + "grad_norm": 1.8046875, + "learning_rate": 1.6422666666666668e-05, + "loss": 0.4321, + "step": 8050 + }, + { + "epoch": 0.1790792747955919, + "grad_norm": 1.7578125, + "learning_rate": 1.6418222222222223e-05, + "loss": 0.4196, + "step": 8060 + }, + { + "epoch": 0.17930145751866336, + "grad_norm": 1.59375, + "learning_rate": 1.641377777777778e-05, + "loss": 0.4491, + "step": 8070 + }, + { + "epoch": 0.1795236402417348, + "grad_norm": 1.953125, + "learning_rate": 1.6409333333333336e-05, + "loss": 0.4271, + "step": 8080 + }, + { + "epoch": 0.17974582296480626, + "grad_norm": 2.390625, + "learning_rate": 1.640488888888889e-05, + "loss": 0.4652, + "step": 8090 + }, + { + "epoch": 0.1799680056878777, + "grad_norm": 1.9140625, + "learning_rate": 1.6400444444444445e-05, + "loss": 0.422, + "step": 8100 + }, + { + "epoch": 0.18019018841094916, + "grad_norm": 2.125, + "learning_rate": 1.6396e-05, + "loss": 0.3766, + "step": 8110 + }, + { + "epoch": 0.18041237113402062, + "grad_norm": 2.09375, + "learning_rate": 1.6391555555555555e-05, + "loss": 0.3981, + "step": 8120 + }, + { + "epoch": 0.18063455385709207, + "grad_norm": 1.796875, + "learning_rate": 1.638711111111111e-05, + "loss": 0.4288, + "step": 8130 + }, + { + "epoch": 0.18085673658016352, + "grad_norm": 2.0625, + "learning_rate": 1.6382666666666668e-05, + "loss": 0.4185, + "step": 8140 + }, + { + "epoch": 0.18107891930323497, + "grad_norm": 2.046875, + "learning_rate": 1.6378222222222223e-05, + "loss": 0.4175, + "step": 8150 + }, + { + "epoch": 0.18130110202630642, + "grad_norm": 2.015625, + "learning_rate": 1.637377777777778e-05, + "loss": 0.4535, + "step": 8160 + }, + { + "epoch": 0.1815232847493779, + "grad_norm": 1.3984375, + "learning_rate": 1.6369333333333336e-05, + "loss": 0.4269, + "step": 8170 + }, + { + "epoch": 0.18174546747244935, + "grad_norm": 1.8984375, + "learning_rate": 1.636488888888889e-05, + "loss": 0.4164, + "step": 8180 + }, + { + "epoch": 0.1819676501955208, + "grad_norm": 1.8671875, + "learning_rate": 1.6360444444444446e-05, + "loss": 0.3888, + "step": 8190 + }, + { + "epoch": 0.18218983291859225, + "grad_norm": 1.984375, + "learning_rate": 1.6356e-05, + "loss": 0.4377, + "step": 8200 + }, + { + "epoch": 0.1824120156416637, + "grad_norm": 1.984375, + "learning_rate": 1.6351555555555555e-05, + "loss": 0.4383, + "step": 8210 + }, + { + "epoch": 0.18263419836473516, + "grad_norm": 2.375, + "learning_rate": 1.634711111111111e-05, + "loss": 0.461, + "step": 8220 + }, + { + "epoch": 0.1828563810878066, + "grad_norm": 2.3125, + "learning_rate": 1.634266666666667e-05, + "loss": 0.4201, + "step": 8230 + }, + { + "epoch": 0.18307856381087806, + "grad_norm": 1.6953125, + "learning_rate": 1.6338222222222223e-05, + "loss": 0.4292, + "step": 8240 + }, + { + "epoch": 0.1833007465339495, + "grad_norm": 2.328125, + "learning_rate": 1.6333777777777778e-05, + "loss": 0.4403, + "step": 8250 + }, + { + "epoch": 0.18352292925702096, + "grad_norm": 1.8203125, + "learning_rate": 1.6329333333333336e-05, + "loss": 0.3969, + "step": 8260 + }, + { + "epoch": 0.18374511198009244, + "grad_norm": 2.28125, + "learning_rate": 1.632488888888889e-05, + "loss": 0.4344, + "step": 8270 + }, + { + "epoch": 0.1839672947031639, + "grad_norm": 1.7734375, + "learning_rate": 1.6320444444444446e-05, + "loss": 0.3917, + "step": 8280 + }, + { + "epoch": 0.18418947742623534, + "grad_norm": 2.078125, + "learning_rate": 1.6316e-05, + "loss": 0.4092, + "step": 8290 + }, + { + "epoch": 0.1844116601493068, + "grad_norm": 1.4765625, + "learning_rate": 1.6311555555555556e-05, + "loss": 0.4259, + "step": 8300 + }, + { + "epoch": 0.18463384287237825, + "grad_norm": 2.25, + "learning_rate": 1.630711111111111e-05, + "loss": 0.4341, + "step": 8310 + }, + { + "epoch": 0.1848560255954497, + "grad_norm": 1.703125, + "learning_rate": 1.630266666666667e-05, + "loss": 0.4374, + "step": 8320 + }, + { + "epoch": 0.18507820831852115, + "grad_norm": 1.921875, + "learning_rate": 1.6298222222222224e-05, + "loss": 0.4293, + "step": 8330 + }, + { + "epoch": 0.1853003910415926, + "grad_norm": 1.8828125, + "learning_rate": 1.629377777777778e-05, + "loss": 0.4398, + "step": 8340 + }, + { + "epoch": 0.18552257376466405, + "grad_norm": 1.8671875, + "learning_rate": 1.6289333333333333e-05, + "loss": 0.4505, + "step": 8350 + }, + { + "epoch": 0.1857447564877355, + "grad_norm": 1.703125, + "learning_rate": 1.628488888888889e-05, + "loss": 0.3886, + "step": 8360 + }, + { + "epoch": 0.18596693921080698, + "grad_norm": 1.640625, + "learning_rate": 1.6280444444444447e-05, + "loss": 0.3695, + "step": 8370 + }, + { + "epoch": 0.18618912193387843, + "grad_norm": 1.6171875, + "learning_rate": 1.6276e-05, + "loss": 0.4525, + "step": 8380 + }, + { + "epoch": 0.18641130465694988, + "grad_norm": 2.296875, + "learning_rate": 1.6271555555555556e-05, + "loss": 0.4285, + "step": 8390 + }, + { + "epoch": 0.18663348738002133, + "grad_norm": 2.140625, + "learning_rate": 1.6267111111111114e-05, + "loss": 0.4404, + "step": 8400 + }, + { + "epoch": 0.18685567010309279, + "grad_norm": 1.84375, + "learning_rate": 1.626266666666667e-05, + "loss": 0.405, + "step": 8410 + }, + { + "epoch": 0.18707785282616424, + "grad_norm": 2.203125, + "learning_rate": 1.6258222222222224e-05, + "loss": 0.4259, + "step": 8420 + }, + { + "epoch": 0.1873000355492357, + "grad_norm": 1.8515625, + "learning_rate": 1.625377777777778e-05, + "loss": 0.4658, + "step": 8430 + }, + { + "epoch": 0.18752221827230714, + "grad_norm": 2.15625, + "learning_rate": 1.6249333333333334e-05, + "loss": 0.4148, + "step": 8440 + }, + { + "epoch": 0.1877444009953786, + "grad_norm": 2.109375, + "learning_rate": 1.624488888888889e-05, + "loss": 0.4423, + "step": 8450 + }, + { + "epoch": 0.18796658371845004, + "grad_norm": 1.6171875, + "learning_rate": 1.6240444444444447e-05, + "loss": 0.4113, + "step": 8460 + }, + { + "epoch": 0.18818876644152152, + "grad_norm": 2.0625, + "learning_rate": 1.6236000000000002e-05, + "loss": 0.4338, + "step": 8470 + }, + { + "epoch": 0.18841094916459297, + "grad_norm": 2.296875, + "learning_rate": 1.6231555555555557e-05, + "loss": 0.4146, + "step": 8480 + }, + { + "epoch": 0.18863313188766442, + "grad_norm": 1.5234375, + "learning_rate": 1.6227111111111115e-05, + "loss": 0.399, + "step": 8490 + }, + { + "epoch": 0.18885531461073587, + "grad_norm": 1.96875, + "learning_rate": 1.622266666666667e-05, + "loss": 0.4089, + "step": 8500 + }, + { + "epoch": 0.18907749733380733, + "grad_norm": 2.0, + "learning_rate": 1.6218222222222225e-05, + "loss": 0.421, + "step": 8510 + }, + { + "epoch": 0.18929968005687878, + "grad_norm": 1.9609375, + "learning_rate": 1.621377777777778e-05, + "loss": 0.4608, + "step": 8520 + }, + { + "epoch": 0.18952186277995023, + "grad_norm": 1.9296875, + "learning_rate": 1.6209333333333334e-05, + "loss": 0.4154, + "step": 8530 + }, + { + "epoch": 0.18974404550302168, + "grad_norm": 2.0, + "learning_rate": 1.620488888888889e-05, + "loss": 0.394, + "step": 8540 + }, + { + "epoch": 0.18996622822609313, + "grad_norm": 2.046875, + "learning_rate": 1.6200444444444444e-05, + "loss": 0.4051, + "step": 8550 + }, + { + "epoch": 0.19018841094916458, + "grad_norm": 1.875, + "learning_rate": 1.6196000000000002e-05, + "loss": 0.3784, + "step": 8560 + }, + { + "epoch": 0.19041059367223603, + "grad_norm": 1.9453125, + "learning_rate": 1.6191555555555557e-05, + "loss": 0.4235, + "step": 8570 + }, + { + "epoch": 0.1906327763953075, + "grad_norm": 1.71875, + "learning_rate": 1.6187111111111112e-05, + "loss": 0.4422, + "step": 8580 + }, + { + "epoch": 0.19085495911837896, + "grad_norm": 1.4453125, + "learning_rate": 1.618266666666667e-05, + "loss": 0.387, + "step": 8590 + }, + { + "epoch": 0.19107714184145042, + "grad_norm": 1.859375, + "learning_rate": 1.6178222222222225e-05, + "loss": 0.4141, + "step": 8600 + }, + { + "epoch": 0.19129932456452187, + "grad_norm": 1.7734375, + "learning_rate": 1.617377777777778e-05, + "loss": 0.4182, + "step": 8610 + }, + { + "epoch": 0.19152150728759332, + "grad_norm": 1.8203125, + "learning_rate": 1.6169333333333335e-05, + "loss": 0.3894, + "step": 8620 + }, + { + "epoch": 0.19174369001066477, + "grad_norm": 1.875, + "learning_rate": 1.616488888888889e-05, + "loss": 0.4464, + "step": 8630 + }, + { + "epoch": 0.19196587273373622, + "grad_norm": 2.015625, + "learning_rate": 1.6160444444444444e-05, + "loss": 0.4745, + "step": 8640 + }, + { + "epoch": 0.19218805545680767, + "grad_norm": 2.203125, + "learning_rate": 1.6156e-05, + "loss": 0.4128, + "step": 8650 + }, + { + "epoch": 0.19241023817987912, + "grad_norm": 1.953125, + "learning_rate": 1.6151555555555557e-05, + "loss": 0.4236, + "step": 8660 + }, + { + "epoch": 0.19263242090295057, + "grad_norm": 2.3125, + "learning_rate": 1.6147111111111112e-05, + "loss": 0.4418, + "step": 8670 + }, + { + "epoch": 0.19285460362602205, + "grad_norm": 1.9921875, + "learning_rate": 1.6142666666666667e-05, + "loss": 0.4106, + "step": 8680 + }, + { + "epoch": 0.1930767863490935, + "grad_norm": 1.7734375, + "learning_rate": 1.6138222222222225e-05, + "loss": 0.4154, + "step": 8690 + }, + { + "epoch": 0.19329896907216496, + "grad_norm": 2.296875, + "learning_rate": 1.613377777777778e-05, + "loss": 0.4608, + "step": 8700 + }, + { + "epoch": 0.1935211517952364, + "grad_norm": 2.015625, + "learning_rate": 1.6129333333333335e-05, + "loss": 0.4428, + "step": 8710 + }, + { + "epoch": 0.19374333451830786, + "grad_norm": 1.6875, + "learning_rate": 1.612488888888889e-05, + "loss": 0.4212, + "step": 8720 + }, + { + "epoch": 0.1939655172413793, + "grad_norm": 1.65625, + "learning_rate": 1.6120444444444445e-05, + "loss": 0.4195, + "step": 8730 + }, + { + "epoch": 0.19418769996445076, + "grad_norm": 1.9609375, + "learning_rate": 1.6116e-05, + "loss": 0.4108, + "step": 8740 + }, + { + "epoch": 0.1944098826875222, + "grad_norm": 1.53125, + "learning_rate": 1.6111555555555554e-05, + "loss": 0.3959, + "step": 8750 + }, + { + "epoch": 0.19463206541059366, + "grad_norm": 1.7578125, + "learning_rate": 1.6107111111111113e-05, + "loss": 0.4463, + "step": 8760 + }, + { + "epoch": 0.19485424813366511, + "grad_norm": 2.03125, + "learning_rate": 1.6102666666666667e-05, + "loss": 0.4036, + "step": 8770 + }, + { + "epoch": 0.1950764308567366, + "grad_norm": 2.0, + "learning_rate": 1.6098222222222222e-05, + "loss": 0.3891, + "step": 8780 + }, + { + "epoch": 0.19529861357980804, + "grad_norm": 2.265625, + "learning_rate": 1.609377777777778e-05, + "loss": 0.5055, + "step": 8790 + }, + { + "epoch": 0.1955207963028795, + "grad_norm": 1.9453125, + "learning_rate": 1.6089333333333335e-05, + "loss": 0.4405, + "step": 8800 + }, + { + "epoch": 0.19574297902595095, + "grad_norm": 1.96875, + "learning_rate": 1.608488888888889e-05, + "loss": 0.4027, + "step": 8810 + }, + { + "epoch": 0.1959651617490224, + "grad_norm": 1.8828125, + "learning_rate": 1.6080444444444445e-05, + "loss": 0.4373, + "step": 8820 + }, + { + "epoch": 0.19618734447209385, + "grad_norm": 1.9765625, + "learning_rate": 1.6076e-05, + "loss": 0.4376, + "step": 8830 + }, + { + "epoch": 0.1964095271951653, + "grad_norm": 2.171875, + "learning_rate": 1.6071555555555555e-05, + "loss": 0.4551, + "step": 8840 + }, + { + "epoch": 0.19663170991823675, + "grad_norm": 1.765625, + "learning_rate": 1.6067111111111113e-05, + "loss": 0.4178, + "step": 8850 + }, + { + "epoch": 0.1968538926413082, + "grad_norm": 1.984375, + "learning_rate": 1.6062666666666668e-05, + "loss": 0.403, + "step": 8860 + }, + { + "epoch": 0.19707607536437965, + "grad_norm": 2.265625, + "learning_rate": 1.6058222222222223e-05, + "loss": 0.4423, + "step": 8870 + }, + { + "epoch": 0.1972982580874511, + "grad_norm": 1.84375, + "learning_rate": 1.605377777777778e-05, + "loss": 0.4375, + "step": 8880 + }, + { + "epoch": 0.19752044081052259, + "grad_norm": 2.046875, + "learning_rate": 1.6049333333333336e-05, + "loss": 0.4115, + "step": 8890 + }, + { + "epoch": 0.19774262353359404, + "grad_norm": 1.8203125, + "learning_rate": 1.604488888888889e-05, + "loss": 0.4264, + "step": 8900 + }, + { + "epoch": 0.1979648062566655, + "grad_norm": 1.671875, + "learning_rate": 1.6040444444444445e-05, + "loss": 0.4018, + "step": 8910 + }, + { + "epoch": 0.19818698897973694, + "grad_norm": 1.8359375, + "learning_rate": 1.6036e-05, + "loss": 0.4006, + "step": 8920 + }, + { + "epoch": 0.1984091717028084, + "grad_norm": 2.1875, + "learning_rate": 1.6031555555555555e-05, + "loss": 0.4268, + "step": 8930 + }, + { + "epoch": 0.19863135442587984, + "grad_norm": 2.03125, + "learning_rate": 1.6027111111111113e-05, + "loss": 0.4188, + "step": 8940 + }, + { + "epoch": 0.1988535371489513, + "grad_norm": 2.015625, + "learning_rate": 1.6022666666666668e-05, + "loss": 0.401, + "step": 8950 + }, + { + "epoch": 0.19907571987202274, + "grad_norm": 1.953125, + "learning_rate": 1.6018222222222223e-05, + "loss": 0.4369, + "step": 8960 + }, + { + "epoch": 0.1992979025950942, + "grad_norm": 2.09375, + "learning_rate": 1.6013777777777778e-05, + "loss": 0.4732, + "step": 8970 + }, + { + "epoch": 0.19952008531816565, + "grad_norm": 1.8984375, + "learning_rate": 1.6009333333333336e-05, + "loss": 0.3873, + "step": 8980 + }, + { + "epoch": 0.19974226804123713, + "grad_norm": 1.671875, + "learning_rate": 1.600488888888889e-05, + "loss": 0.4255, + "step": 8990 + }, + { + "epoch": 0.19996445076430858, + "grad_norm": 2.21875, + "learning_rate": 1.6000444444444446e-05, + "loss": 0.4358, + "step": 9000 + }, + { + "epoch": 0.20018663348738003, + "grad_norm": 1.9296875, + "learning_rate": 1.5996e-05, + "loss": 0.4104, + "step": 9010 + }, + { + "epoch": 0.20040881621045148, + "grad_norm": 2.21875, + "learning_rate": 1.599155555555556e-05, + "loss": 0.4249, + "step": 9020 + }, + { + "epoch": 0.20063099893352293, + "grad_norm": 2.1875, + "learning_rate": 1.5987111111111114e-05, + "loss": 0.4467, + "step": 9030 + }, + { + "epoch": 0.20085318165659438, + "grad_norm": 1.8828125, + "learning_rate": 1.598266666666667e-05, + "loss": 0.4227, + "step": 9040 + }, + { + "epoch": 0.20107536437966583, + "grad_norm": 1.5, + "learning_rate": 1.5978222222222223e-05, + "loss": 0.3909, + "step": 9050 + }, + { + "epoch": 0.20129754710273728, + "grad_norm": 2.046875, + "learning_rate": 1.5973777777777778e-05, + "loss": 0.4254, + "step": 9060 + }, + { + "epoch": 0.20151972982580874, + "grad_norm": 1.953125, + "learning_rate": 1.5969333333333333e-05, + "loss": 0.4576, + "step": 9070 + }, + { + "epoch": 0.2017419125488802, + "grad_norm": 1.9296875, + "learning_rate": 1.596488888888889e-05, + "loss": 0.4059, + "step": 9080 + }, + { + "epoch": 0.20196409527195167, + "grad_norm": 2.015625, + "learning_rate": 1.5960444444444446e-05, + "loss": 0.4446, + "step": 9090 + }, + { + "epoch": 0.20218627799502312, + "grad_norm": 1.9921875, + "learning_rate": 1.5956e-05, + "loss": 0.4466, + "step": 9100 + }, + { + "epoch": 0.20240846071809457, + "grad_norm": 1.7734375, + "learning_rate": 1.595155555555556e-05, + "loss": 0.4282, + "step": 9110 + }, + { + "epoch": 0.20263064344116602, + "grad_norm": 2.09375, + "learning_rate": 1.5947111111111114e-05, + "loss": 0.4246, + "step": 9120 + }, + { + "epoch": 0.20285282616423747, + "grad_norm": 2.78125, + "learning_rate": 1.594266666666667e-05, + "loss": 0.4582, + "step": 9130 + }, + { + "epoch": 0.20307500888730892, + "grad_norm": 2.21875, + "learning_rate": 1.5938222222222224e-05, + "loss": 0.4167, + "step": 9140 + }, + { + "epoch": 0.20329719161038037, + "grad_norm": 2.234375, + "learning_rate": 1.593377777777778e-05, + "loss": 0.4345, + "step": 9150 + }, + { + "epoch": 0.20351937433345182, + "grad_norm": 1.9609375, + "learning_rate": 1.5929333333333334e-05, + "loss": 0.4211, + "step": 9160 + }, + { + "epoch": 0.20374155705652328, + "grad_norm": 2.28125, + "learning_rate": 1.592488888888889e-05, + "loss": 0.4329, + "step": 9170 + }, + { + "epoch": 0.20396373977959473, + "grad_norm": 2.3125, + "learning_rate": 1.5920444444444447e-05, + "loss": 0.4123, + "step": 9180 + }, + { + "epoch": 0.2041859225026662, + "grad_norm": 1.53125, + "learning_rate": 1.5916e-05, + "loss": 0.4131, + "step": 9190 + }, + { + "epoch": 0.20440810522573766, + "grad_norm": 1.8046875, + "learning_rate": 1.5911555555555556e-05, + "loss": 0.3962, + "step": 9200 + }, + { + "epoch": 0.2046302879488091, + "grad_norm": 1.6484375, + "learning_rate": 1.5907111111111115e-05, + "loss": 0.4045, + "step": 9210 + }, + { + "epoch": 0.20485247067188056, + "grad_norm": 2.203125, + "learning_rate": 1.590266666666667e-05, + "loss": 0.4396, + "step": 9220 + }, + { + "epoch": 0.205074653394952, + "grad_norm": 2.09375, + "learning_rate": 1.5898222222222224e-05, + "loss": 0.4224, + "step": 9230 + }, + { + "epoch": 0.20529683611802346, + "grad_norm": 2.03125, + "learning_rate": 1.589377777777778e-05, + "loss": 0.4518, + "step": 9240 + }, + { + "epoch": 0.2055190188410949, + "grad_norm": 2.234375, + "learning_rate": 1.5889333333333334e-05, + "loss": 0.4198, + "step": 9250 + }, + { + "epoch": 0.20574120156416636, + "grad_norm": 2.125, + "learning_rate": 1.588488888888889e-05, + "loss": 0.4272, + "step": 9260 + }, + { + "epoch": 0.20596338428723782, + "grad_norm": 1.875, + "learning_rate": 1.5880444444444444e-05, + "loss": 0.4243, + "step": 9270 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 2.0625, + "learning_rate": 1.5876000000000002e-05, + "loss": 0.3997, + "step": 9280 + }, + { + "epoch": 0.20640774973338072, + "grad_norm": 1.953125, + "learning_rate": 1.5871555555555557e-05, + "loss": 0.4025, + "step": 9290 + }, + { + "epoch": 0.2066299324564522, + "grad_norm": 1.84375, + "learning_rate": 1.586711111111111e-05, + "loss": 0.3916, + "step": 9300 + }, + { + "epoch": 0.20685211517952365, + "grad_norm": 1.8671875, + "learning_rate": 1.586266666666667e-05, + "loss": 0.4201, + "step": 9310 + }, + { + "epoch": 0.2070742979025951, + "grad_norm": 1.9609375, + "learning_rate": 1.5858222222222225e-05, + "loss": 0.4181, + "step": 9320 + }, + { + "epoch": 0.20729648062566655, + "grad_norm": 2.21875, + "learning_rate": 1.585377777777778e-05, + "loss": 0.4092, + "step": 9330 + }, + { + "epoch": 0.207518663348738, + "grad_norm": 2.078125, + "learning_rate": 1.5849333333333334e-05, + "loss": 0.4069, + "step": 9340 + }, + { + "epoch": 0.20774084607180945, + "grad_norm": 1.7109375, + "learning_rate": 1.584488888888889e-05, + "loss": 0.3788, + "step": 9350 + }, + { + "epoch": 0.2079630287948809, + "grad_norm": 1.765625, + "learning_rate": 1.5840444444444444e-05, + "loss": 0.3968, + "step": 9360 + }, + { + "epoch": 0.20818521151795236, + "grad_norm": 1.875, + "learning_rate": 1.5836e-05, + "loss": 0.4224, + "step": 9370 + }, + { + "epoch": 0.2084073942410238, + "grad_norm": 1.6640625, + "learning_rate": 1.5831555555555557e-05, + "loss": 0.4123, + "step": 9380 + }, + { + "epoch": 0.20862957696409526, + "grad_norm": 1.71875, + "learning_rate": 1.5827111111111112e-05, + "loss": 0.4285, + "step": 9390 + }, + { + "epoch": 0.20885175968716674, + "grad_norm": 1.8828125, + "learning_rate": 1.5822666666666667e-05, + "loss": 0.4053, + "step": 9400 + }, + { + "epoch": 0.2090739424102382, + "grad_norm": 1.7890625, + "learning_rate": 1.5818222222222225e-05, + "loss": 0.3969, + "step": 9410 + }, + { + "epoch": 0.20929612513330964, + "grad_norm": 1.8046875, + "learning_rate": 1.581377777777778e-05, + "loss": 0.4062, + "step": 9420 + }, + { + "epoch": 0.2095183078563811, + "grad_norm": 2.03125, + "learning_rate": 1.5809333333333335e-05, + "loss": 0.4248, + "step": 9430 + }, + { + "epoch": 0.20974049057945254, + "grad_norm": 1.8125, + "learning_rate": 1.580488888888889e-05, + "loss": 0.4129, + "step": 9440 + }, + { + "epoch": 0.209962673302524, + "grad_norm": 1.7578125, + "learning_rate": 1.5800444444444444e-05, + "loss": 0.4518, + "step": 9450 + }, + { + "epoch": 0.21018485602559545, + "grad_norm": 2.109375, + "learning_rate": 1.5796e-05, + "loss": 0.4386, + "step": 9460 + }, + { + "epoch": 0.2104070387486669, + "grad_norm": 2.15625, + "learning_rate": 1.5791555555555557e-05, + "loss": 0.42, + "step": 9470 + }, + { + "epoch": 0.21062922147173835, + "grad_norm": 1.9140625, + "learning_rate": 1.5787111111111112e-05, + "loss": 0.4053, + "step": 9480 + }, + { + "epoch": 0.2108514041948098, + "grad_norm": 2.078125, + "learning_rate": 1.5782666666666667e-05, + "loss": 0.424, + "step": 9490 + }, + { + "epoch": 0.21107358691788128, + "grad_norm": 2.234375, + "learning_rate": 1.5778222222222225e-05, + "loss": 0.4544, + "step": 9500 + }, + { + "epoch": 0.21129576964095273, + "grad_norm": 1.7265625, + "learning_rate": 1.577377777777778e-05, + "loss": 0.4027, + "step": 9510 + }, + { + "epoch": 0.21151795236402418, + "grad_norm": 2.40625, + "learning_rate": 1.5769333333333335e-05, + "loss": 0.4137, + "step": 9520 + }, + { + "epoch": 0.21174013508709563, + "grad_norm": 1.8671875, + "learning_rate": 1.576488888888889e-05, + "loss": 0.4224, + "step": 9530 + }, + { + "epoch": 0.21196231781016708, + "grad_norm": 1.9453125, + "learning_rate": 1.5760444444444445e-05, + "loss": 0.4379, + "step": 9540 + }, + { + "epoch": 0.21218450053323853, + "grad_norm": 1.625, + "learning_rate": 1.5756e-05, + "loss": 0.4296, + "step": 9550 + }, + { + "epoch": 0.21240668325630999, + "grad_norm": 2.203125, + "learning_rate": 1.5751555555555558e-05, + "loss": 0.4358, + "step": 9560 + }, + { + "epoch": 0.21262886597938144, + "grad_norm": 2.25, + "learning_rate": 1.5747111111111113e-05, + "loss": 0.4208, + "step": 9570 + }, + { + "epoch": 0.2128510487024529, + "grad_norm": 1.9296875, + "learning_rate": 1.5742666666666668e-05, + "loss": 0.4214, + "step": 9580 + }, + { + "epoch": 0.21307323142552434, + "grad_norm": 1.6171875, + "learning_rate": 1.5738222222222222e-05, + "loss": 0.3849, + "step": 9590 + }, + { + "epoch": 0.21329541414859582, + "grad_norm": 1.9921875, + "learning_rate": 1.573377777777778e-05, + "loss": 0.4297, + "step": 9600 + }, + { + "epoch": 0.21351759687166727, + "grad_norm": 2.046875, + "learning_rate": 1.5729333333333335e-05, + "loss": 0.4194, + "step": 9610 + }, + { + "epoch": 0.21373977959473872, + "grad_norm": 1.984375, + "learning_rate": 1.572488888888889e-05, + "loss": 0.3898, + "step": 9620 + }, + { + "epoch": 0.21396196231781017, + "grad_norm": 2.046875, + "learning_rate": 1.5720444444444445e-05, + "loss": 0.4226, + "step": 9630 + }, + { + "epoch": 0.21418414504088162, + "grad_norm": 2.265625, + "learning_rate": 1.5716000000000003e-05, + "loss": 0.4438, + "step": 9640 + }, + { + "epoch": 0.21440632776395308, + "grad_norm": 2.03125, + "learning_rate": 1.5711555555555558e-05, + "loss": 0.4744, + "step": 9650 + }, + { + "epoch": 0.21462851048702453, + "grad_norm": 2.296875, + "learning_rate": 1.5707111111111113e-05, + "loss": 0.4753, + "step": 9660 + }, + { + "epoch": 0.21485069321009598, + "grad_norm": 2.25, + "learning_rate": 1.5702666666666668e-05, + "loss": 0.4282, + "step": 9670 + }, + { + "epoch": 0.21507287593316743, + "grad_norm": 1.8203125, + "learning_rate": 1.5698222222222223e-05, + "loss": 0.4125, + "step": 9680 + }, + { + "epoch": 0.21529505865623888, + "grad_norm": 1.8515625, + "learning_rate": 1.5693777777777778e-05, + "loss": 0.3632, + "step": 9690 + }, + { + "epoch": 0.21551724137931033, + "grad_norm": 1.9609375, + "learning_rate": 1.5689333333333336e-05, + "loss": 0.4184, + "step": 9700 + }, + { + "epoch": 0.2157394241023818, + "grad_norm": 1.9140625, + "learning_rate": 1.568488888888889e-05, + "loss": 0.431, + "step": 9710 + }, + { + "epoch": 0.21596160682545326, + "grad_norm": 2.015625, + "learning_rate": 1.5680444444444445e-05, + "loss": 0.4466, + "step": 9720 + }, + { + "epoch": 0.2161837895485247, + "grad_norm": 2.09375, + "learning_rate": 1.5676000000000004e-05, + "loss": 0.4611, + "step": 9730 + }, + { + "epoch": 0.21640597227159616, + "grad_norm": 1.8203125, + "learning_rate": 1.567155555555556e-05, + "loss": 0.4283, + "step": 9740 + }, + { + "epoch": 0.21662815499466762, + "grad_norm": 1.828125, + "learning_rate": 1.5667111111111113e-05, + "loss": 0.4084, + "step": 9750 + }, + { + "epoch": 0.21685033771773907, + "grad_norm": 2.359375, + "learning_rate": 1.5662666666666668e-05, + "loss": 0.4268, + "step": 9760 + }, + { + "epoch": 0.21707252044081052, + "grad_norm": 2.046875, + "learning_rate": 1.5658222222222223e-05, + "loss": 0.3887, + "step": 9770 + }, + { + "epoch": 0.21729470316388197, + "grad_norm": 2.0625, + "learning_rate": 1.5653777777777778e-05, + "loss": 0.3958, + "step": 9780 + }, + { + "epoch": 0.21751688588695342, + "grad_norm": 2.078125, + "learning_rate": 1.5649333333333333e-05, + "loss": 0.4138, + "step": 9790 + }, + { + "epoch": 0.21773906861002487, + "grad_norm": 1.328125, + "learning_rate": 1.564488888888889e-05, + "loss": 0.4339, + "step": 9800 + }, + { + "epoch": 0.21796125133309635, + "grad_norm": 2.046875, + "learning_rate": 1.5640444444444446e-05, + "loss": 0.414, + "step": 9810 + }, + { + "epoch": 0.2181834340561678, + "grad_norm": 1.7890625, + "learning_rate": 1.5636e-05, + "loss": 0.4075, + "step": 9820 + }, + { + "epoch": 0.21840561677923925, + "grad_norm": 1.6875, + "learning_rate": 1.563155555555556e-05, + "loss": 0.39, + "step": 9830 + }, + { + "epoch": 0.2186277995023107, + "grad_norm": 2.03125, + "learning_rate": 1.5627111111111114e-05, + "loss": 0.4215, + "step": 9840 + }, + { + "epoch": 0.21884998222538216, + "grad_norm": 2.171875, + "learning_rate": 1.562266666666667e-05, + "loss": 0.4007, + "step": 9850 + }, + { + "epoch": 0.2190721649484536, + "grad_norm": 2.015625, + "learning_rate": 1.5618222222222223e-05, + "loss": 0.391, + "step": 9860 + }, + { + "epoch": 0.21929434767152506, + "grad_norm": 1.9453125, + "learning_rate": 1.561377777777778e-05, + "loss": 0.4381, + "step": 9870 + }, + { + "epoch": 0.2195165303945965, + "grad_norm": 2.03125, + "learning_rate": 1.5609333333333333e-05, + "loss": 0.4479, + "step": 9880 + }, + { + "epoch": 0.21973871311766796, + "grad_norm": 2.515625, + "learning_rate": 1.5604888888888888e-05, + "loss": 0.4425, + "step": 9890 + }, + { + "epoch": 0.2199608958407394, + "grad_norm": 2.03125, + "learning_rate": 1.5600444444444446e-05, + "loss": 0.4015, + "step": 9900 + }, + { + "epoch": 0.2201830785638109, + "grad_norm": 1.921875, + "learning_rate": 1.5596e-05, + "loss": 0.4212, + "step": 9910 + }, + { + "epoch": 0.22040526128688234, + "grad_norm": 1.609375, + "learning_rate": 1.5591555555555556e-05, + "loss": 0.4013, + "step": 9920 + }, + { + "epoch": 0.2206274440099538, + "grad_norm": 2.1875, + "learning_rate": 1.5587111111111114e-05, + "loss": 0.4078, + "step": 9930 + }, + { + "epoch": 0.22084962673302524, + "grad_norm": 2.40625, + "learning_rate": 1.558266666666667e-05, + "loss": 0.4728, + "step": 9940 + }, + { + "epoch": 0.2210718094560967, + "grad_norm": 2.234375, + "learning_rate": 1.5578222222222224e-05, + "loss": 0.4154, + "step": 9950 + }, + { + "epoch": 0.22129399217916815, + "grad_norm": 1.859375, + "learning_rate": 1.557377777777778e-05, + "loss": 0.3855, + "step": 9960 + }, + { + "epoch": 0.2215161749022396, + "grad_norm": 2.453125, + "learning_rate": 1.5569333333333334e-05, + "loss": 0.4627, + "step": 9970 + }, + { + "epoch": 0.22173835762531105, + "grad_norm": 2.140625, + "learning_rate": 1.556488888888889e-05, + "loss": 0.4345, + "step": 9980 + }, + { + "epoch": 0.2219605403483825, + "grad_norm": 1.7578125, + "learning_rate": 1.5560444444444443e-05, + "loss": 0.4055, + "step": 9990 + }, + { + "epoch": 0.22218272307145395, + "grad_norm": 2.015625, + "learning_rate": 1.5556e-05, + "loss": 0.4263, + "step": 10000 + }, + { + "epoch": 0.22240490579452543, + "grad_norm": 1.59375, + "learning_rate": 1.5551555555555556e-05, + "loss": 0.3832, + "step": 10010 + }, + { + "epoch": 0.22262708851759688, + "grad_norm": 1.859375, + "learning_rate": 1.554711111111111e-05, + "loss": 0.3835, + "step": 10020 + }, + { + "epoch": 0.22284927124066833, + "grad_norm": 1.8359375, + "learning_rate": 1.554266666666667e-05, + "loss": 0.3895, + "step": 10030 + }, + { + "epoch": 0.22307145396373979, + "grad_norm": 2.0, + "learning_rate": 1.5538222222222224e-05, + "loss": 0.4213, + "step": 10040 + }, + { + "epoch": 0.22329363668681124, + "grad_norm": 1.921875, + "learning_rate": 1.553377777777778e-05, + "loss": 0.3773, + "step": 10050 + }, + { + "epoch": 0.2235158194098827, + "grad_norm": 1.7890625, + "learning_rate": 1.5529333333333334e-05, + "loss": 0.3892, + "step": 10060 + }, + { + "epoch": 0.22373800213295414, + "grad_norm": 2.078125, + "learning_rate": 1.552488888888889e-05, + "loss": 0.4141, + "step": 10070 + }, + { + "epoch": 0.2239601848560256, + "grad_norm": 1.78125, + "learning_rate": 1.5520444444444444e-05, + "loss": 0.3856, + "step": 10080 + }, + { + "epoch": 0.22418236757909704, + "grad_norm": 1.9765625, + "learning_rate": 1.5516000000000002e-05, + "loss": 0.4099, + "step": 10090 + }, + { + "epoch": 0.2244045503021685, + "grad_norm": 1.7421875, + "learning_rate": 1.5511555555555557e-05, + "loss": 0.4171, + "step": 10100 + }, + { + "epoch": 0.22462673302523994, + "grad_norm": 2.234375, + "learning_rate": 1.550711111111111e-05, + "loss": 0.4606, + "step": 10110 + }, + { + "epoch": 0.22484891574831142, + "grad_norm": 1.71875, + "learning_rate": 1.5502666666666666e-05, + "loss": 0.4292, + "step": 10120 + }, + { + "epoch": 0.22507109847138287, + "grad_norm": 2.0625, + "learning_rate": 1.5498222222222225e-05, + "loss": 0.3862, + "step": 10130 + }, + { + "epoch": 0.22529328119445433, + "grad_norm": 1.984375, + "learning_rate": 1.549377777777778e-05, + "loss": 0.4293, + "step": 10140 + }, + { + "epoch": 0.22551546391752578, + "grad_norm": 1.9609375, + "learning_rate": 1.5489333333333334e-05, + "loss": 0.4031, + "step": 10150 + }, + { + "epoch": 0.22573764664059723, + "grad_norm": 1.8125, + "learning_rate": 1.548488888888889e-05, + "loss": 0.4126, + "step": 10160 + }, + { + "epoch": 0.22595982936366868, + "grad_norm": 2.359375, + "learning_rate": 1.5480444444444444e-05, + "loss": 0.4381, + "step": 10170 + }, + { + "epoch": 0.22618201208674013, + "grad_norm": 2.171875, + "learning_rate": 1.5476000000000002e-05, + "loss": 0.4138, + "step": 10180 + }, + { + "epoch": 0.22640419480981158, + "grad_norm": 1.671875, + "learning_rate": 1.5471555555555557e-05, + "loss": 0.4036, + "step": 10190 + }, + { + "epoch": 0.22662637753288303, + "grad_norm": 2.15625, + "learning_rate": 1.5467111111111112e-05, + "loss": 0.4509, + "step": 10200 + }, + { + "epoch": 0.22684856025595448, + "grad_norm": 1.75, + "learning_rate": 1.5462666666666667e-05, + "loss": 0.4349, + "step": 10210 + }, + { + "epoch": 0.22707074297902596, + "grad_norm": 2.125, + "learning_rate": 1.5458222222222225e-05, + "loss": 0.4387, + "step": 10220 + }, + { + "epoch": 0.22729292570209741, + "grad_norm": 1.8125, + "learning_rate": 1.545377777777778e-05, + "loss": 0.3939, + "step": 10230 + }, + { + "epoch": 0.22751510842516887, + "grad_norm": 2.203125, + "learning_rate": 1.5449333333333335e-05, + "loss": 0.3793, + "step": 10240 + }, + { + "epoch": 0.22773729114824032, + "grad_norm": 2.1875, + "learning_rate": 1.544488888888889e-05, + "loss": 0.4314, + "step": 10250 + }, + { + "epoch": 0.22795947387131177, + "grad_norm": 2.15625, + "learning_rate": 1.5440444444444448e-05, + "loss": 0.4209, + "step": 10260 + }, + { + "epoch": 0.22818165659438322, + "grad_norm": 1.96875, + "learning_rate": 1.5436000000000003e-05, + "loss": 0.4622, + "step": 10270 + }, + { + "epoch": 0.22840383931745467, + "grad_norm": 1.703125, + "learning_rate": 1.5431555555555557e-05, + "loss": 0.4193, + "step": 10280 + }, + { + "epoch": 0.22862602204052612, + "grad_norm": 2.140625, + "learning_rate": 1.5427111111111112e-05, + "loss": 0.4271, + "step": 10290 + }, + { + "epoch": 0.22884820476359757, + "grad_norm": 1.7578125, + "learning_rate": 1.5422666666666667e-05, + "loss": 0.4034, + "step": 10300 + }, + { + "epoch": 0.22907038748666902, + "grad_norm": 1.9140625, + "learning_rate": 1.5418222222222222e-05, + "loss": 0.4247, + "step": 10310 + }, + { + "epoch": 0.2292925702097405, + "grad_norm": 1.9296875, + "learning_rate": 1.541377777777778e-05, + "loss": 0.4069, + "step": 10320 + }, + { + "epoch": 0.22951475293281196, + "grad_norm": 2.0, + "learning_rate": 1.5409333333333335e-05, + "loss": 0.4488, + "step": 10330 + }, + { + "epoch": 0.2297369356558834, + "grad_norm": 2.234375, + "learning_rate": 1.540488888888889e-05, + "loss": 0.4234, + "step": 10340 + }, + { + "epoch": 0.22995911837895486, + "grad_norm": 1.875, + "learning_rate": 1.5400444444444448e-05, + "loss": 0.4164, + "step": 10350 + }, + { + "epoch": 0.2301813011020263, + "grad_norm": 2.03125, + "learning_rate": 1.5396000000000003e-05, + "loss": 0.4142, + "step": 10360 + }, + { + "epoch": 0.23040348382509776, + "grad_norm": 2.125, + "learning_rate": 1.5391555555555558e-05, + "loss": 0.401, + "step": 10370 + }, + { + "epoch": 0.2306256665481692, + "grad_norm": 2.09375, + "learning_rate": 1.5387111111111113e-05, + "loss": 0.3975, + "step": 10380 + }, + { + "epoch": 0.23084784927124066, + "grad_norm": 2.53125, + "learning_rate": 1.5382666666666668e-05, + "loss": 0.4395, + "step": 10390 + }, + { + "epoch": 0.23107003199431211, + "grad_norm": 2.359375, + "learning_rate": 1.5378222222222222e-05, + "loss": 0.4033, + "step": 10400 + }, + { + "epoch": 0.23129221471738357, + "grad_norm": 2.1875, + "learning_rate": 1.5373777777777777e-05, + "loss": 0.4159, + "step": 10410 + }, + { + "epoch": 0.23151439744045502, + "grad_norm": 2.5625, + "learning_rate": 1.5369333333333335e-05, + "loss": 0.3769, + "step": 10420 + }, + { + "epoch": 0.2317365801635265, + "grad_norm": 2.109375, + "learning_rate": 1.536488888888889e-05, + "loss": 0.3959, + "step": 10430 + }, + { + "epoch": 0.23195876288659795, + "grad_norm": 2.078125, + "learning_rate": 1.5360444444444445e-05, + "loss": 0.3965, + "step": 10440 + }, + { + "epoch": 0.2321809456096694, + "grad_norm": 1.984375, + "learning_rate": 1.5356000000000003e-05, + "loss": 0.4088, + "step": 10450 + }, + { + "epoch": 0.23240312833274085, + "grad_norm": 1.796875, + "learning_rate": 1.5351555555555558e-05, + "loss": 0.3845, + "step": 10460 + }, + { + "epoch": 0.2326253110558123, + "grad_norm": 2.203125, + "learning_rate": 1.5347111111111113e-05, + "loss": 0.4088, + "step": 10470 + }, + { + "epoch": 0.23284749377888375, + "grad_norm": 2.125, + "learning_rate": 1.5342666666666668e-05, + "loss": 0.4099, + "step": 10480 + }, + { + "epoch": 0.2330696765019552, + "grad_norm": 1.90625, + "learning_rate": 1.5338222222222223e-05, + "loss": 0.4516, + "step": 10490 + }, + { + "epoch": 0.23329185922502665, + "grad_norm": 2.234375, + "learning_rate": 1.5333777777777778e-05, + "loss": 0.4197, + "step": 10500 + }, + { + "epoch": 0.2335140419480981, + "grad_norm": 1.5234375, + "learning_rate": 1.5329333333333332e-05, + "loss": 0.4455, + "step": 10510 + }, + { + "epoch": 0.23373622467116956, + "grad_norm": 2.296875, + "learning_rate": 1.532488888888889e-05, + "loss": 0.391, + "step": 10520 + }, + { + "epoch": 0.23395840739424104, + "grad_norm": 1.875, + "learning_rate": 1.5320444444444446e-05, + "loss": 0.4127, + "step": 10530 + }, + { + "epoch": 0.2341805901173125, + "grad_norm": 2.203125, + "learning_rate": 1.5316e-05, + "loss": 0.4628, + "step": 10540 + }, + { + "epoch": 0.23440277284038394, + "grad_norm": 1.765625, + "learning_rate": 1.531155555555556e-05, + "loss": 0.414, + "step": 10550 + }, + { + "epoch": 0.2346249555634554, + "grad_norm": 2.09375, + "learning_rate": 1.5307111111111113e-05, + "loss": 0.4388, + "step": 10560 + }, + { + "epoch": 0.23484713828652684, + "grad_norm": 1.984375, + "learning_rate": 1.530266666666667e-05, + "loss": 0.4559, + "step": 10570 + }, + { + "epoch": 0.2350693210095983, + "grad_norm": 1.8359375, + "learning_rate": 1.5298222222222223e-05, + "loss": 0.429, + "step": 10580 + }, + { + "epoch": 0.23529150373266974, + "grad_norm": 1.4765625, + "learning_rate": 1.5293777777777778e-05, + "loss": 0.4091, + "step": 10590 + }, + { + "epoch": 0.2355136864557412, + "grad_norm": 1.6640625, + "learning_rate": 1.5289333333333333e-05, + "loss": 0.3757, + "step": 10600 + }, + { + "epoch": 0.23573586917881265, + "grad_norm": 2.265625, + "learning_rate": 1.5284888888888888e-05, + "loss": 0.4064, + "step": 10610 + }, + { + "epoch": 0.2359580519018841, + "grad_norm": 2.234375, + "learning_rate": 1.5280444444444446e-05, + "loss": 0.4397, + "step": 10620 + }, + { + "epoch": 0.23618023462495558, + "grad_norm": 2.0, + "learning_rate": 1.5276e-05, + "loss": 0.4588, + "step": 10630 + }, + { + "epoch": 0.23640241734802703, + "grad_norm": 1.9921875, + "learning_rate": 1.5271555555555556e-05, + "loss": 0.4108, + "step": 10640 + }, + { + "epoch": 0.23662460007109848, + "grad_norm": 2.09375, + "learning_rate": 1.5267111111111114e-05, + "loss": 0.4755, + "step": 10650 + }, + { + "epoch": 0.23684678279416993, + "grad_norm": 1.9296875, + "learning_rate": 1.526266666666667e-05, + "loss": 0.4257, + "step": 10660 + }, + { + "epoch": 0.23706896551724138, + "grad_norm": 1.9140625, + "learning_rate": 1.5258222222222224e-05, + "loss": 0.403, + "step": 10670 + }, + { + "epoch": 0.23729114824031283, + "grad_norm": 2.09375, + "learning_rate": 1.5253777777777778e-05, + "loss": 0.4352, + "step": 10680 + }, + { + "epoch": 0.23751333096338428, + "grad_norm": 2.296875, + "learning_rate": 1.5249333333333333e-05, + "loss": 0.4014, + "step": 10690 + }, + { + "epoch": 0.23773551368645574, + "grad_norm": 2.15625, + "learning_rate": 1.524488888888889e-05, + "loss": 0.4451, + "step": 10700 + }, + { + "epoch": 0.2379576964095272, + "grad_norm": 1.765625, + "learning_rate": 1.5240444444444446e-05, + "loss": 0.4506, + "step": 10710 + }, + { + "epoch": 0.23817987913259864, + "grad_norm": 2.796875, + "learning_rate": 1.5236000000000001e-05, + "loss": 0.4418, + "step": 10720 + }, + { + "epoch": 0.23840206185567012, + "grad_norm": 1.7421875, + "learning_rate": 1.5231555555555558e-05, + "loss": 0.3964, + "step": 10730 + }, + { + "epoch": 0.23862424457874157, + "grad_norm": 2.234375, + "learning_rate": 1.5227111111111113e-05, + "loss": 0.429, + "step": 10740 + }, + { + "epoch": 0.23884642730181302, + "grad_norm": 1.8125, + "learning_rate": 1.5222666666666667e-05, + "loss": 0.3924, + "step": 10750 + }, + { + "epoch": 0.23906861002488447, + "grad_norm": 1.7578125, + "learning_rate": 1.5218222222222224e-05, + "loss": 0.4009, + "step": 10760 + }, + { + "epoch": 0.23929079274795592, + "grad_norm": 2.125, + "learning_rate": 1.5213777777777779e-05, + "loss": 0.411, + "step": 10770 + }, + { + "epoch": 0.23951297547102737, + "grad_norm": 1.9296875, + "learning_rate": 1.5209333333333334e-05, + "loss": 0.4171, + "step": 10780 + }, + { + "epoch": 0.23973515819409882, + "grad_norm": 1.96875, + "learning_rate": 1.5204888888888888e-05, + "loss": 0.4003, + "step": 10790 + }, + { + "epoch": 0.23995734091717028, + "grad_norm": 1.765625, + "learning_rate": 1.5200444444444447e-05, + "loss": 0.3866, + "step": 10800 + }, + { + "epoch": 0.24017952364024173, + "grad_norm": 2.03125, + "learning_rate": 1.5196000000000002e-05, + "loss": 0.4146, + "step": 10810 + }, + { + "epoch": 0.24040170636331318, + "grad_norm": 1.75, + "learning_rate": 1.5191555555555556e-05, + "loss": 0.3649, + "step": 10820 + }, + { + "epoch": 0.24062388908638463, + "grad_norm": 2.015625, + "learning_rate": 1.5187111111111113e-05, + "loss": 0.4661, + "step": 10830 + }, + { + "epoch": 0.2408460718094561, + "grad_norm": 2.1875, + "learning_rate": 1.5182666666666668e-05, + "loss": 0.4078, + "step": 10840 + }, + { + "epoch": 0.24106825453252756, + "grad_norm": 2.609375, + "learning_rate": 1.5178222222222223e-05, + "loss": 0.401, + "step": 10850 + }, + { + "epoch": 0.241290437255599, + "grad_norm": 2.03125, + "learning_rate": 1.517377777777778e-05, + "loss": 0.4201, + "step": 10860 + }, + { + "epoch": 0.24151261997867046, + "grad_norm": 1.890625, + "learning_rate": 1.5169333333333334e-05, + "loss": 0.3816, + "step": 10870 + }, + { + "epoch": 0.2417348027017419, + "grad_norm": 2.03125, + "learning_rate": 1.516488888888889e-05, + "loss": 0.4488, + "step": 10880 + }, + { + "epoch": 0.24195698542481336, + "grad_norm": 2.25, + "learning_rate": 1.5160444444444447e-05, + "loss": 0.3732, + "step": 10890 + }, + { + "epoch": 0.24217916814788482, + "grad_norm": 2.15625, + "learning_rate": 1.5156000000000002e-05, + "loss": 0.4348, + "step": 10900 + }, + { + "epoch": 0.24240135087095627, + "grad_norm": 2.109375, + "learning_rate": 1.5151555555555557e-05, + "loss": 0.4499, + "step": 10910 + }, + { + "epoch": 0.24262353359402772, + "grad_norm": 1.828125, + "learning_rate": 1.5147111111111112e-05, + "loss": 0.4267, + "step": 10920 + }, + { + "epoch": 0.24284571631709917, + "grad_norm": 1.9140625, + "learning_rate": 1.5142666666666668e-05, + "loss": 0.4251, + "step": 10930 + }, + { + "epoch": 0.24306789904017065, + "grad_norm": 2.109375, + "learning_rate": 1.5138222222222223e-05, + "loss": 0.3651, + "step": 10940 + }, + { + "epoch": 0.2432900817632421, + "grad_norm": 1.8515625, + "learning_rate": 1.5133777777777778e-05, + "loss": 0.3934, + "step": 10950 + }, + { + "epoch": 0.24351226448631355, + "grad_norm": 1.6484375, + "learning_rate": 1.5129333333333334e-05, + "loss": 0.3921, + "step": 10960 + }, + { + "epoch": 0.243734447209385, + "grad_norm": 1.9375, + "learning_rate": 1.5124888888888891e-05, + "loss": 0.4099, + "step": 10970 + }, + { + "epoch": 0.24395662993245645, + "grad_norm": 2.234375, + "learning_rate": 1.5120444444444446e-05, + "loss": 0.4271, + "step": 10980 + }, + { + "epoch": 0.2441788126555279, + "grad_norm": 1.65625, + "learning_rate": 1.5116000000000002e-05, + "loss": 0.4147, + "step": 10990 + }, + { + "epoch": 0.24440099537859936, + "grad_norm": 1.890625, + "learning_rate": 1.5111555555555557e-05, + "loss": 0.411, + "step": 11000 + }, + { + "epoch": 0.2446231781016708, + "grad_norm": 2.15625, + "learning_rate": 1.5107111111111112e-05, + "loss": 0.3897, + "step": 11010 + }, + { + "epoch": 0.24484536082474226, + "grad_norm": 2.09375, + "learning_rate": 1.5102666666666667e-05, + "loss": 0.4195, + "step": 11020 + }, + { + "epoch": 0.2450675435478137, + "grad_norm": 2.046875, + "learning_rate": 1.5098222222222223e-05, + "loss": 0.3904, + "step": 11030 + }, + { + "epoch": 0.2452897262708852, + "grad_norm": 2.15625, + "learning_rate": 1.5093777777777778e-05, + "loss": 0.3475, + "step": 11040 + }, + { + "epoch": 0.24551190899395664, + "grad_norm": 1.953125, + "learning_rate": 1.5089333333333333e-05, + "loss": 0.4163, + "step": 11050 + }, + { + "epoch": 0.2457340917170281, + "grad_norm": 1.984375, + "learning_rate": 1.5084888888888891e-05, + "loss": 0.4046, + "step": 11060 + }, + { + "epoch": 0.24595627444009954, + "grad_norm": 2.046875, + "learning_rate": 1.5080444444444446e-05, + "loss": 0.4096, + "step": 11070 + }, + { + "epoch": 0.246178457163171, + "grad_norm": 2.34375, + "learning_rate": 1.5076000000000001e-05, + "loss": 0.4577, + "step": 11080 + }, + { + "epoch": 0.24640063988624245, + "grad_norm": 1.6953125, + "learning_rate": 1.5071555555555558e-05, + "loss": 0.3678, + "step": 11090 + }, + { + "epoch": 0.2466228226093139, + "grad_norm": 2.484375, + "learning_rate": 1.5067111111111112e-05, + "loss": 0.4472, + "step": 11100 + }, + { + "epoch": 0.24684500533238535, + "grad_norm": 2.125, + "learning_rate": 1.5062666666666667e-05, + "loss": 0.4231, + "step": 11110 + }, + { + "epoch": 0.2470671880554568, + "grad_norm": 2.125, + "learning_rate": 1.5058222222222224e-05, + "loss": 0.3996, + "step": 11120 + }, + { + "epoch": 0.24728937077852825, + "grad_norm": 2.34375, + "learning_rate": 1.5053777777777779e-05, + "loss": 0.4105, + "step": 11130 + }, + { + "epoch": 0.24751155350159973, + "grad_norm": 1.765625, + "learning_rate": 1.5049333333333333e-05, + "loss": 0.4256, + "step": 11140 + }, + { + "epoch": 0.24773373622467118, + "grad_norm": 1.6015625, + "learning_rate": 1.5044888888888892e-05, + "loss": 0.4418, + "step": 11150 + }, + { + "epoch": 0.24795591894774263, + "grad_norm": 1.546875, + "learning_rate": 1.5040444444444447e-05, + "loss": 0.4127, + "step": 11160 + }, + { + "epoch": 0.24817810167081408, + "grad_norm": 2.125, + "learning_rate": 1.5036000000000001e-05, + "loss": 0.4138, + "step": 11170 + }, + { + "epoch": 0.24840028439388553, + "grad_norm": 2.25, + "learning_rate": 1.5031555555555556e-05, + "loss": 0.4248, + "step": 11180 + }, + { + "epoch": 0.24862246711695699, + "grad_norm": 1.90625, + "learning_rate": 1.5027111111111113e-05, + "loss": 0.3865, + "step": 11190 + }, + { + "epoch": 0.24884464984002844, + "grad_norm": 1.515625, + "learning_rate": 1.5022666666666668e-05, + "loss": 0.4174, + "step": 11200 + }, + { + "epoch": 0.2490668325630999, + "grad_norm": 2.015625, + "learning_rate": 1.5018222222222222e-05, + "loss": 0.393, + "step": 11210 + }, + { + "epoch": 0.24928901528617134, + "grad_norm": 1.8828125, + "learning_rate": 1.5013777777777779e-05, + "loss": 0.4152, + "step": 11220 + }, + { + "epoch": 0.2495111980092428, + "grad_norm": 2.203125, + "learning_rate": 1.5009333333333334e-05, + "loss": 0.4029, + "step": 11230 + }, + { + "epoch": 0.24973338073231424, + "grad_norm": 2.296875, + "learning_rate": 1.500488888888889e-05, + "loss": 0.3629, + "step": 11240 + }, + { + "epoch": 0.24995556345538572, + "grad_norm": 2.03125, + "learning_rate": 1.5000444444444447e-05, + "loss": 0.3671, + "step": 11250 + }, + { + "epoch": 0.25017774617845717, + "grad_norm": 1.9453125, + "learning_rate": 1.4996000000000002e-05, + "loss": 0.413, + "step": 11260 + }, + { + "epoch": 0.2503999289015286, + "grad_norm": 1.8203125, + "learning_rate": 1.4991555555555557e-05, + "loss": 0.404, + "step": 11270 + }, + { + "epoch": 0.2506221116246001, + "grad_norm": 2.03125, + "learning_rate": 1.4987111111111111e-05, + "loss": 0.4044, + "step": 11280 + }, + { + "epoch": 0.2508442943476715, + "grad_norm": 2.078125, + "learning_rate": 1.4982666666666668e-05, + "loss": 0.4293, + "step": 11290 + }, + { + "epoch": 0.251066477070743, + "grad_norm": 1.84375, + "learning_rate": 1.4978222222222223e-05, + "loss": 0.3781, + "step": 11300 + }, + { + "epoch": 0.25128865979381443, + "grad_norm": 2.140625, + "learning_rate": 1.4973777777777778e-05, + "loss": 0.458, + "step": 11310 + }, + { + "epoch": 0.2515108425168859, + "grad_norm": 1.6484375, + "learning_rate": 1.4969333333333334e-05, + "loss": 0.4059, + "step": 11320 + }, + { + "epoch": 0.25173302523995733, + "grad_norm": 2.21875, + "learning_rate": 1.496488888888889e-05, + "loss": 0.3664, + "step": 11330 + }, + { + "epoch": 0.2519552079630288, + "grad_norm": 2.296875, + "learning_rate": 1.4960444444444446e-05, + "loss": 0.4389, + "step": 11340 + }, + { + "epoch": 0.25217739068610023, + "grad_norm": 2.625, + "learning_rate": 1.4956000000000002e-05, + "loss": 0.4298, + "step": 11350 + }, + { + "epoch": 0.2523995734091717, + "grad_norm": 1.8359375, + "learning_rate": 1.4951555555555557e-05, + "loss": 0.4423, + "step": 11360 + }, + { + "epoch": 0.25262175613224314, + "grad_norm": 2.0625, + "learning_rate": 1.4947111111111112e-05, + "loss": 0.4159, + "step": 11370 + }, + { + "epoch": 0.2528439388553146, + "grad_norm": 1.7734375, + "learning_rate": 1.4942666666666668e-05, + "loss": 0.3997, + "step": 11380 + }, + { + "epoch": 0.25306612157838604, + "grad_norm": 2.0, + "learning_rate": 1.4938222222222223e-05, + "loss": 0.3869, + "step": 11390 + }, + { + "epoch": 0.25328830430145755, + "grad_norm": 2.09375, + "learning_rate": 1.4933777777777778e-05, + "loss": 0.4579, + "step": 11400 + }, + { + "epoch": 0.253510487024529, + "grad_norm": 2.34375, + "learning_rate": 1.4929333333333333e-05, + "loss": 0.4292, + "step": 11410 + }, + { + "epoch": 0.25373266974760045, + "grad_norm": 1.6875, + "learning_rate": 1.4924888888888891e-05, + "loss": 0.4276, + "step": 11420 + }, + { + "epoch": 0.2539548524706719, + "grad_norm": 2.109375, + "learning_rate": 1.4920444444444446e-05, + "loss": 0.3931, + "step": 11430 + }, + { + "epoch": 0.25417703519374335, + "grad_norm": 2.078125, + "learning_rate": 1.4916e-05, + "loss": 0.4296, + "step": 11440 + }, + { + "epoch": 0.2543992179168148, + "grad_norm": 1.5546875, + "learning_rate": 1.4911555555555557e-05, + "loss": 0.3877, + "step": 11450 + }, + { + "epoch": 0.25462140063988625, + "grad_norm": 1.7109375, + "learning_rate": 1.4907111111111112e-05, + "loss": 0.4044, + "step": 11460 + }, + { + "epoch": 0.2548435833629577, + "grad_norm": 2.15625, + "learning_rate": 1.4902666666666667e-05, + "loss": 0.4242, + "step": 11470 + }, + { + "epoch": 0.25506576608602916, + "grad_norm": 2.109375, + "learning_rate": 1.4898222222222224e-05, + "loss": 0.3976, + "step": 11480 + }, + { + "epoch": 0.2552879488091006, + "grad_norm": 1.921875, + "learning_rate": 1.4893777777777778e-05, + "loss": 0.4272, + "step": 11490 + }, + { + "epoch": 0.25551013153217206, + "grad_norm": 1.84375, + "learning_rate": 1.4889333333333335e-05, + "loss": 0.4176, + "step": 11500 + }, + { + "epoch": 0.2557323142552435, + "grad_norm": 1.9375, + "learning_rate": 1.4884888888888892e-05, + "loss": 0.4055, + "step": 11510 + }, + { + "epoch": 0.25595449697831496, + "grad_norm": 2.734375, + "learning_rate": 1.4880444444444446e-05, + "loss": 0.4689, + "step": 11520 + }, + { + "epoch": 0.2561766797013864, + "grad_norm": 2.328125, + "learning_rate": 1.4876000000000001e-05, + "loss": 0.4248, + "step": 11530 + }, + { + "epoch": 0.25639886242445786, + "grad_norm": 1.765625, + "learning_rate": 1.4871555555555556e-05, + "loss": 0.3976, + "step": 11540 + }, + { + "epoch": 0.2566210451475293, + "grad_norm": 2.078125, + "learning_rate": 1.4867111111111113e-05, + "loss": 0.4339, + "step": 11550 + }, + { + "epoch": 0.25684322787060077, + "grad_norm": 2.375, + "learning_rate": 1.4862666666666667e-05, + "loss": 0.3969, + "step": 11560 + }, + { + "epoch": 0.2570654105936722, + "grad_norm": 2.234375, + "learning_rate": 1.4858222222222222e-05, + "loss": 0.4235, + "step": 11570 + }, + { + "epoch": 0.25728759331674367, + "grad_norm": 1.9375, + "learning_rate": 1.4853777777777779e-05, + "loss": 0.4033, + "step": 11580 + }, + { + "epoch": 0.2575097760398151, + "grad_norm": 2.0625, + "learning_rate": 1.4849333333333335e-05, + "loss": 0.3886, + "step": 11590 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 2.265625, + "learning_rate": 1.484488888888889e-05, + "loss": 0.3896, + "step": 11600 + }, + { + "epoch": 0.2579541414859581, + "grad_norm": 1.9453125, + "learning_rate": 1.4840444444444447e-05, + "loss": 0.3802, + "step": 11610 + }, + { + "epoch": 0.25817632420902953, + "grad_norm": 2.03125, + "learning_rate": 1.4836000000000002e-05, + "loss": 0.4334, + "step": 11620 + }, + { + "epoch": 0.258398506932101, + "grad_norm": 2.0, + "learning_rate": 1.4831555555555556e-05, + "loss": 0.4029, + "step": 11630 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 2.59375, + "learning_rate": 1.4827111111111111e-05, + "loss": 0.3693, + "step": 11640 + }, + { + "epoch": 0.2588428723782439, + "grad_norm": 2.328125, + "learning_rate": 1.4822666666666668e-05, + "loss": 0.4093, + "step": 11650 + }, + { + "epoch": 0.25906505510131533, + "grad_norm": 2.578125, + "learning_rate": 1.4818222222222223e-05, + "loss": 0.4496, + "step": 11660 + }, + { + "epoch": 0.2592872378243868, + "grad_norm": 2.234375, + "learning_rate": 1.4813777777777778e-05, + "loss": 0.405, + "step": 11670 + }, + { + "epoch": 0.25950942054745824, + "grad_norm": 2.03125, + "learning_rate": 1.4809333333333336e-05, + "loss": 0.3803, + "step": 11680 + }, + { + "epoch": 0.2597316032705297, + "grad_norm": 1.9375, + "learning_rate": 1.480488888888889e-05, + "loss": 0.3986, + "step": 11690 + }, + { + "epoch": 0.25995378599360114, + "grad_norm": 1.71875, + "learning_rate": 1.4800444444444445e-05, + "loss": 0.4121, + "step": 11700 + }, + { + "epoch": 0.2601759687166726, + "grad_norm": 1.7265625, + "learning_rate": 1.4796000000000002e-05, + "loss": 0.3735, + "step": 11710 + }, + { + "epoch": 0.26039815143974404, + "grad_norm": 2.203125, + "learning_rate": 1.4791555555555557e-05, + "loss": 0.396, + "step": 11720 + }, + { + "epoch": 0.2606203341628155, + "grad_norm": 2.15625, + "learning_rate": 1.4787111111111112e-05, + "loss": 0.3862, + "step": 11730 + }, + { + "epoch": 0.26084251688588694, + "grad_norm": 2.25, + "learning_rate": 1.4782666666666668e-05, + "loss": 0.3994, + "step": 11740 + }, + { + "epoch": 0.2610646996089584, + "grad_norm": 2.4375, + "learning_rate": 1.4778222222222223e-05, + "loss": 0.3949, + "step": 11750 + }, + { + "epoch": 0.26128688233202985, + "grad_norm": 1.8359375, + "learning_rate": 1.4773777777777778e-05, + "loss": 0.4026, + "step": 11760 + }, + { + "epoch": 0.2615090650551013, + "grad_norm": 2.171875, + "learning_rate": 1.4769333333333336e-05, + "loss": 0.4095, + "step": 11770 + }, + { + "epoch": 0.26173124777817275, + "grad_norm": 2.28125, + "learning_rate": 1.4764888888888891e-05, + "loss": 0.4203, + "step": 11780 + }, + { + "epoch": 0.2619534305012442, + "grad_norm": 1.90625, + "learning_rate": 1.4760444444444446e-05, + "loss": 0.3837, + "step": 11790 + }, + { + "epoch": 0.26217561322431565, + "grad_norm": 2.25, + "learning_rate": 1.4756e-05, + "loss": 0.4313, + "step": 11800 + }, + { + "epoch": 0.26239779594738716, + "grad_norm": 2.015625, + "learning_rate": 1.4751555555555557e-05, + "loss": 0.4183, + "step": 11810 + }, + { + "epoch": 0.2626199786704586, + "grad_norm": 1.953125, + "learning_rate": 1.4747111111111112e-05, + "loss": 0.4251, + "step": 11820 + }, + { + "epoch": 0.26284216139353006, + "grad_norm": 2.3125, + "learning_rate": 1.4742666666666667e-05, + "loss": 0.4186, + "step": 11830 + }, + { + "epoch": 0.2630643441166015, + "grad_norm": 2.125, + "learning_rate": 1.4738222222222223e-05, + "loss": 0.3983, + "step": 11840 + }, + { + "epoch": 0.26328652683967296, + "grad_norm": 1.8359375, + "learning_rate": 1.4733777777777778e-05, + "loss": 0.4149, + "step": 11850 + }, + { + "epoch": 0.2635087095627444, + "grad_norm": 1.859375, + "learning_rate": 1.4729333333333335e-05, + "loss": 0.3967, + "step": 11860 + }, + { + "epoch": 0.26373089228581587, + "grad_norm": 1.8984375, + "learning_rate": 1.4724888888888891e-05, + "loss": 0.4256, + "step": 11870 + }, + { + "epoch": 0.2639530750088873, + "grad_norm": 2.09375, + "learning_rate": 1.4720444444444446e-05, + "loss": 0.4262, + "step": 11880 + }, + { + "epoch": 0.26417525773195877, + "grad_norm": 2.109375, + "learning_rate": 1.4716000000000001e-05, + "loss": 0.4139, + "step": 11890 + }, + { + "epoch": 0.2643974404550302, + "grad_norm": 1.546875, + "learning_rate": 1.4711555555555556e-05, + "loss": 0.4439, + "step": 11900 + }, + { + "epoch": 0.26461962317810167, + "grad_norm": 1.9296875, + "learning_rate": 1.4707111111111112e-05, + "loss": 0.4166, + "step": 11910 + }, + { + "epoch": 0.2648418059011731, + "grad_norm": 1.9921875, + "learning_rate": 1.4702666666666667e-05, + "loss": 0.4344, + "step": 11920 + }, + { + "epoch": 0.2650639886242446, + "grad_norm": 1.765625, + "learning_rate": 1.4698222222222222e-05, + "loss": 0.3895, + "step": 11930 + }, + { + "epoch": 0.265286171347316, + "grad_norm": 2.296875, + "learning_rate": 1.4693777777777779e-05, + "loss": 0.4079, + "step": 11940 + }, + { + "epoch": 0.2655083540703875, + "grad_norm": 1.7421875, + "learning_rate": 1.4689333333333335e-05, + "loss": 0.4007, + "step": 11950 + }, + { + "epoch": 0.2657305367934589, + "grad_norm": 1.9296875, + "learning_rate": 1.468488888888889e-05, + "loss": 0.404, + "step": 11960 + }, + { + "epoch": 0.2659527195165304, + "grad_norm": 1.875, + "learning_rate": 1.4680444444444447e-05, + "loss": 0.3996, + "step": 11970 + }, + { + "epoch": 0.26617490223960183, + "grad_norm": 1.7890625, + "learning_rate": 1.4676000000000001e-05, + "loss": 0.4015, + "step": 11980 + }, + { + "epoch": 0.2663970849626733, + "grad_norm": 1.9296875, + "learning_rate": 1.4671555555555556e-05, + "loss": 0.4102, + "step": 11990 + }, + { + "epoch": 0.26661926768574473, + "grad_norm": 2.03125, + "learning_rate": 1.4667111111111111e-05, + "loss": 0.398, + "step": 12000 + }, + { + "epoch": 0.2668414504088162, + "grad_norm": 1.921875, + "learning_rate": 1.4662666666666668e-05, + "loss": 0.4066, + "step": 12010 + }, + { + "epoch": 0.2670636331318877, + "grad_norm": 2.28125, + "learning_rate": 1.4658222222222223e-05, + "loss": 0.4213, + "step": 12020 + }, + { + "epoch": 0.26728581585495914, + "grad_norm": 2.15625, + "learning_rate": 1.4653777777777777e-05, + "loss": 0.398, + "step": 12030 + }, + { + "epoch": 0.2675079985780306, + "grad_norm": 1.6796875, + "learning_rate": 1.4649333333333336e-05, + "loss": 0.3696, + "step": 12040 + }, + { + "epoch": 0.26773018130110204, + "grad_norm": 1.8984375, + "learning_rate": 1.464488888888889e-05, + "loss": 0.3524, + "step": 12050 + }, + { + "epoch": 0.2679523640241735, + "grad_norm": 1.8515625, + "learning_rate": 1.4640444444444445e-05, + "loss": 0.4166, + "step": 12060 + }, + { + "epoch": 0.26817454674724495, + "grad_norm": 2.109375, + "learning_rate": 1.4636000000000002e-05, + "loss": 0.4267, + "step": 12070 + }, + { + "epoch": 0.2683967294703164, + "grad_norm": 1.953125, + "learning_rate": 1.4631555555555557e-05, + "loss": 0.4347, + "step": 12080 + }, + { + "epoch": 0.26861891219338785, + "grad_norm": 2.21875, + "learning_rate": 1.4627111111111112e-05, + "loss": 0.436, + "step": 12090 + }, + { + "epoch": 0.2688410949164593, + "grad_norm": 1.7109375, + "learning_rate": 1.4622666666666668e-05, + "loss": 0.3948, + "step": 12100 + }, + { + "epoch": 0.26906327763953075, + "grad_norm": 2.296875, + "learning_rate": 1.4618222222222223e-05, + "loss": 0.4133, + "step": 12110 + }, + { + "epoch": 0.2692854603626022, + "grad_norm": 2.015625, + "learning_rate": 1.461377777777778e-05, + "loss": 0.3911, + "step": 12120 + }, + { + "epoch": 0.26950764308567365, + "grad_norm": 2.0625, + "learning_rate": 1.4609333333333336e-05, + "loss": 0.4272, + "step": 12130 + }, + { + "epoch": 0.2697298258087451, + "grad_norm": 2.578125, + "learning_rate": 1.460488888888889e-05, + "loss": 0.4426, + "step": 12140 + }, + { + "epoch": 0.26995200853181656, + "grad_norm": 1.859375, + "learning_rate": 1.4600444444444446e-05, + "loss": 0.4244, + "step": 12150 + }, + { + "epoch": 0.270174191254888, + "grad_norm": 2.171875, + "learning_rate": 1.4596e-05, + "loss": 0.4009, + "step": 12160 + }, + { + "epoch": 0.27039637397795946, + "grad_norm": 1.9296875, + "learning_rate": 1.4591555555555557e-05, + "loss": 0.386, + "step": 12170 + }, + { + "epoch": 0.2706185567010309, + "grad_norm": 1.953125, + "learning_rate": 1.4587111111111112e-05, + "loss": 0.4325, + "step": 12180 + }, + { + "epoch": 0.27084073942410236, + "grad_norm": 1.71875, + "learning_rate": 1.4582666666666667e-05, + "loss": 0.3962, + "step": 12190 + }, + { + "epoch": 0.2710629221471738, + "grad_norm": 2.265625, + "learning_rate": 1.4578222222222223e-05, + "loss": 0.4439, + "step": 12200 + }, + { + "epoch": 0.27128510487024526, + "grad_norm": 1.9453125, + "learning_rate": 1.457377777777778e-05, + "loss": 0.4283, + "step": 12210 + }, + { + "epoch": 0.27150728759331677, + "grad_norm": 2.109375, + "learning_rate": 1.4569333333333335e-05, + "loss": 0.4495, + "step": 12220 + }, + { + "epoch": 0.2717294703163882, + "grad_norm": 1.90625, + "learning_rate": 1.4564888888888891e-05, + "loss": 0.3652, + "step": 12230 + }, + { + "epoch": 0.2719516530394597, + "grad_norm": 1.859375, + "learning_rate": 1.4560444444444446e-05, + "loss": 0.3978, + "step": 12240 + }, + { + "epoch": 0.2721738357625311, + "grad_norm": 1.6953125, + "learning_rate": 1.4556000000000001e-05, + "loss": 0.4179, + "step": 12250 + }, + { + "epoch": 0.2723960184856026, + "grad_norm": 2.265625, + "learning_rate": 1.4551555555555556e-05, + "loss": 0.4119, + "step": 12260 + }, + { + "epoch": 0.272618201208674, + "grad_norm": 2.234375, + "learning_rate": 1.4547111111111112e-05, + "loss": 0.414, + "step": 12270 + }, + { + "epoch": 0.2728403839317455, + "grad_norm": 1.734375, + "learning_rate": 1.4542666666666667e-05, + "loss": 0.4351, + "step": 12280 + }, + { + "epoch": 0.27306256665481693, + "grad_norm": 1.9921875, + "learning_rate": 1.4538222222222222e-05, + "loss": 0.3993, + "step": 12290 + }, + { + "epoch": 0.2732847493778884, + "grad_norm": 2.09375, + "learning_rate": 1.453377777777778e-05, + "loss": 0.4078, + "step": 12300 + }, + { + "epoch": 0.27350693210095983, + "grad_norm": 2.015625, + "learning_rate": 1.4529333333333335e-05, + "loss": 0.4145, + "step": 12310 + }, + { + "epoch": 0.2737291148240313, + "grad_norm": 2.359375, + "learning_rate": 1.452488888888889e-05, + "loss": 0.4601, + "step": 12320 + }, + { + "epoch": 0.27395129754710273, + "grad_norm": 2.3125, + "learning_rate": 1.4520444444444446e-05, + "loss": 0.4044, + "step": 12330 + }, + { + "epoch": 0.2741734802701742, + "grad_norm": 2.4375, + "learning_rate": 1.4516000000000001e-05, + "loss": 0.4171, + "step": 12340 + }, + { + "epoch": 0.27439566299324564, + "grad_norm": 2.046875, + "learning_rate": 1.4511555555555556e-05, + "loss": 0.4093, + "step": 12350 + }, + { + "epoch": 0.2746178457163171, + "grad_norm": 1.7890625, + "learning_rate": 1.4507111111111111e-05, + "loss": 0.3676, + "step": 12360 + }, + { + "epoch": 0.27484002843938854, + "grad_norm": 2.359375, + "learning_rate": 1.4502666666666668e-05, + "loss": 0.4007, + "step": 12370 + }, + { + "epoch": 0.27506221116246, + "grad_norm": 1.9296875, + "learning_rate": 1.4498222222222222e-05, + "loss": 0.3973, + "step": 12380 + }, + { + "epoch": 0.27528439388553144, + "grad_norm": 1.890625, + "learning_rate": 1.449377777777778e-05, + "loss": 0.3983, + "step": 12390 + }, + { + "epoch": 0.2755065766086029, + "grad_norm": 2.09375, + "learning_rate": 1.4489333333333335e-05, + "loss": 0.4332, + "step": 12400 + }, + { + "epoch": 0.27572875933167434, + "grad_norm": 2.59375, + "learning_rate": 1.448488888888889e-05, + "loss": 0.4434, + "step": 12410 + }, + { + "epoch": 0.2759509420547458, + "grad_norm": 1.859375, + "learning_rate": 1.4480444444444445e-05, + "loss": 0.4142, + "step": 12420 + }, + { + "epoch": 0.2761731247778173, + "grad_norm": 2.078125, + "learning_rate": 1.4476000000000002e-05, + "loss": 0.4278, + "step": 12430 + }, + { + "epoch": 0.27639530750088875, + "grad_norm": 2.15625, + "learning_rate": 1.4471555555555557e-05, + "loss": 0.4008, + "step": 12440 + }, + { + "epoch": 0.2766174902239602, + "grad_norm": 1.8984375, + "learning_rate": 1.4467111111111111e-05, + "loss": 0.3918, + "step": 12450 + }, + { + "epoch": 0.27683967294703166, + "grad_norm": 2.109375, + "learning_rate": 1.4462666666666668e-05, + "loss": 0.3931, + "step": 12460 + }, + { + "epoch": 0.2770618556701031, + "grad_norm": 2.421875, + "learning_rate": 1.4458222222222223e-05, + "loss": 0.4337, + "step": 12470 + }, + { + "epoch": 0.27728403839317456, + "grad_norm": 2.0625, + "learning_rate": 1.445377777777778e-05, + "loss": 0.4374, + "step": 12480 + }, + { + "epoch": 0.277506221116246, + "grad_norm": 2.015625, + "learning_rate": 1.4449333333333336e-05, + "loss": 0.4279, + "step": 12490 + }, + { + "epoch": 0.27772840383931746, + "grad_norm": 1.9921875, + "learning_rate": 1.444488888888889e-05, + "loss": 0.4163, + "step": 12500 + }, + { + "epoch": 0.2779505865623889, + "grad_norm": 2.203125, + "learning_rate": 1.4440444444444446e-05, + "loss": 0.4316, + "step": 12510 + }, + { + "epoch": 0.27817276928546036, + "grad_norm": 2.078125, + "learning_rate": 1.4436e-05, + "loss": 0.4183, + "step": 12520 + }, + { + "epoch": 0.2783949520085318, + "grad_norm": 2.390625, + "learning_rate": 1.4431555555555557e-05, + "loss": 0.4326, + "step": 12530 + }, + { + "epoch": 0.27861713473160327, + "grad_norm": 1.9609375, + "learning_rate": 1.4427111111111112e-05, + "loss": 0.4192, + "step": 12540 + }, + { + "epoch": 0.2788393174546747, + "grad_norm": 2.21875, + "learning_rate": 1.4422666666666667e-05, + "loss": 0.4446, + "step": 12550 + }, + { + "epoch": 0.27906150017774617, + "grad_norm": 2.265625, + "learning_rate": 1.4418222222222223e-05, + "loss": 0.3878, + "step": 12560 + }, + { + "epoch": 0.2792836829008176, + "grad_norm": 1.984375, + "learning_rate": 1.441377777777778e-05, + "loss": 0.4499, + "step": 12570 + }, + { + "epoch": 0.27950586562388907, + "grad_norm": 1.984375, + "learning_rate": 1.4409333333333335e-05, + "loss": 0.4303, + "step": 12580 + }, + { + "epoch": 0.2797280483469605, + "grad_norm": 2.25, + "learning_rate": 1.4404888888888891e-05, + "loss": 0.3757, + "step": 12590 + }, + { + "epoch": 0.279950231070032, + "grad_norm": 2.046875, + "learning_rate": 1.4400444444444446e-05, + "loss": 0.3978, + "step": 12600 + }, + { + "epoch": 0.2801724137931034, + "grad_norm": 2.046875, + "learning_rate": 1.4396e-05, + "loss": 0.4275, + "step": 12610 + }, + { + "epoch": 0.2803945965161749, + "grad_norm": 1.78125, + "learning_rate": 1.4391555555555556e-05, + "loss": 0.407, + "step": 12620 + }, + { + "epoch": 0.2806167792392464, + "grad_norm": 2.21875, + "learning_rate": 1.4387111111111112e-05, + "loss": 0.4344, + "step": 12630 + }, + { + "epoch": 0.28083896196231783, + "grad_norm": 1.9375, + "learning_rate": 1.4382666666666667e-05, + "loss": 0.3926, + "step": 12640 + }, + { + "epoch": 0.2810611446853893, + "grad_norm": 2.015625, + "learning_rate": 1.4378222222222222e-05, + "loss": 0.4002, + "step": 12650 + }, + { + "epoch": 0.28128332740846074, + "grad_norm": 2.078125, + "learning_rate": 1.437377777777778e-05, + "loss": 0.3673, + "step": 12660 + }, + { + "epoch": 0.2815055101315322, + "grad_norm": 2.765625, + "learning_rate": 1.4369333333333335e-05, + "loss": 0.3803, + "step": 12670 + }, + { + "epoch": 0.28172769285460364, + "grad_norm": 2.03125, + "learning_rate": 1.436488888888889e-05, + "loss": 0.4249, + "step": 12680 + }, + { + "epoch": 0.2819498755776751, + "grad_norm": 2.296875, + "learning_rate": 1.4360444444444446e-05, + "loss": 0.4255, + "step": 12690 + }, + { + "epoch": 0.28217205830074654, + "grad_norm": 2.515625, + "learning_rate": 1.4356000000000001e-05, + "loss": 0.4691, + "step": 12700 + }, + { + "epoch": 0.282394241023818, + "grad_norm": 1.96875, + "learning_rate": 1.4351555555555556e-05, + "loss": 0.4481, + "step": 12710 + }, + { + "epoch": 0.28261642374688944, + "grad_norm": 2.171875, + "learning_rate": 1.434711111111111e-05, + "loss": 0.3862, + "step": 12720 + }, + { + "epoch": 0.2828386064699609, + "grad_norm": 1.8515625, + "learning_rate": 1.4342666666666667e-05, + "loss": 0.404, + "step": 12730 + }, + { + "epoch": 0.28306078919303235, + "grad_norm": 2.265625, + "learning_rate": 1.4338222222222224e-05, + "loss": 0.3762, + "step": 12740 + }, + { + "epoch": 0.2832829719161038, + "grad_norm": 2.109375, + "learning_rate": 1.433377777777778e-05, + "loss": 0.4001, + "step": 12750 + }, + { + "epoch": 0.28350515463917525, + "grad_norm": 2.34375, + "learning_rate": 1.4329333333333335e-05, + "loss": 0.4061, + "step": 12760 + }, + { + "epoch": 0.2837273373622467, + "grad_norm": 1.71875, + "learning_rate": 1.432488888888889e-05, + "loss": 0.416, + "step": 12770 + }, + { + "epoch": 0.28394952008531815, + "grad_norm": 1.75, + "learning_rate": 1.4320444444444445e-05, + "loss": 0.3675, + "step": 12780 + }, + { + "epoch": 0.2841717028083896, + "grad_norm": 2.625, + "learning_rate": 1.4316000000000002e-05, + "loss": 0.3726, + "step": 12790 + }, + { + "epoch": 0.28439388553146105, + "grad_norm": 1.8984375, + "learning_rate": 1.4311555555555556e-05, + "loss": 0.4244, + "step": 12800 + }, + { + "epoch": 0.2846160682545325, + "grad_norm": 2.03125, + "learning_rate": 1.4307111111111111e-05, + "loss": 0.4231, + "step": 12810 + }, + { + "epoch": 0.28483825097760396, + "grad_norm": 1.8984375, + "learning_rate": 1.4302666666666668e-05, + "loss": 0.4255, + "step": 12820 + }, + { + "epoch": 0.2850604337006754, + "grad_norm": 2.453125, + "learning_rate": 1.4298222222222224e-05, + "loss": 0.4275, + "step": 12830 + }, + { + "epoch": 0.2852826164237469, + "grad_norm": 2.34375, + "learning_rate": 1.4293777777777779e-05, + "loss": 0.4344, + "step": 12840 + }, + { + "epoch": 0.28550479914681837, + "grad_norm": 2.671875, + "learning_rate": 1.4289333333333336e-05, + "loss": 0.4564, + "step": 12850 + }, + { + "epoch": 0.2857269818698898, + "grad_norm": 1.96875, + "learning_rate": 1.428488888888889e-05, + "loss": 0.3974, + "step": 12860 + }, + { + "epoch": 0.28594916459296127, + "grad_norm": 2.09375, + "learning_rate": 1.4280444444444445e-05, + "loss": 0.3708, + "step": 12870 + }, + { + "epoch": 0.2861713473160327, + "grad_norm": 2.15625, + "learning_rate": 1.4276e-05, + "loss": 0.4, + "step": 12880 + }, + { + "epoch": 0.28639353003910417, + "grad_norm": 2.09375, + "learning_rate": 1.4271555555555557e-05, + "loss": 0.3767, + "step": 12890 + }, + { + "epoch": 0.2866157127621756, + "grad_norm": 2.015625, + "learning_rate": 1.4267111111111112e-05, + "loss": 0.4141, + "step": 12900 + }, + { + "epoch": 0.2868378954852471, + "grad_norm": 1.7421875, + "learning_rate": 1.4262666666666666e-05, + "loss": 0.4175, + "step": 12910 + }, + { + "epoch": 0.2870600782083185, + "grad_norm": 1.953125, + "learning_rate": 1.4258222222222225e-05, + "loss": 0.4612, + "step": 12920 + }, + { + "epoch": 0.28728226093139, + "grad_norm": 2.5, + "learning_rate": 1.425377777777778e-05, + "loss": 0.4161, + "step": 12930 + }, + { + "epoch": 0.28750444365446143, + "grad_norm": 1.6875, + "learning_rate": 1.4249333333333334e-05, + "loss": 0.4, + "step": 12940 + }, + { + "epoch": 0.2877266263775329, + "grad_norm": 2.21875, + "learning_rate": 1.4244888888888891e-05, + "loss": 0.4594, + "step": 12950 + }, + { + "epoch": 0.28794880910060433, + "grad_norm": 1.9921875, + "learning_rate": 1.4240444444444446e-05, + "loss": 0.4308, + "step": 12960 + }, + { + "epoch": 0.2881709918236758, + "grad_norm": 2.109375, + "learning_rate": 1.4236e-05, + "loss": 0.4383, + "step": 12970 + }, + { + "epoch": 0.28839317454674723, + "grad_norm": 2.078125, + "learning_rate": 1.4231555555555555e-05, + "loss": 0.4272, + "step": 12980 + }, + { + "epoch": 0.2886153572698187, + "grad_norm": 1.8828125, + "learning_rate": 1.4227111111111112e-05, + "loss": 0.4004, + "step": 12990 + }, + { + "epoch": 0.28883753999289014, + "grad_norm": 2.0625, + "learning_rate": 1.4222666666666667e-05, + "loss": 0.4038, + "step": 13000 + }, + { + "epoch": 0.2890597227159616, + "grad_norm": 1.7890625, + "learning_rate": 1.4218222222222225e-05, + "loss": 0.3879, + "step": 13010 + }, + { + "epoch": 0.28928190543903304, + "grad_norm": 2.046875, + "learning_rate": 1.421377777777778e-05, + "loss": 0.3951, + "step": 13020 + }, + { + "epoch": 0.2895040881621045, + "grad_norm": 2.09375, + "learning_rate": 1.4209333333333335e-05, + "loss": 0.3893, + "step": 13030 + }, + { + "epoch": 0.289726270885176, + "grad_norm": 2.09375, + "learning_rate": 1.420488888888889e-05, + "loss": 0.4287, + "step": 13040 + }, + { + "epoch": 0.28994845360824745, + "grad_norm": 1.8828125, + "learning_rate": 1.4200444444444446e-05, + "loss": 0.3676, + "step": 13050 + }, + { + "epoch": 0.2901706363313189, + "grad_norm": 2.46875, + "learning_rate": 1.4196000000000001e-05, + "loss": 0.4039, + "step": 13060 + }, + { + "epoch": 0.29039281905439035, + "grad_norm": 2.28125, + "learning_rate": 1.4191555555555556e-05, + "loss": 0.4102, + "step": 13070 + }, + { + "epoch": 0.2906150017774618, + "grad_norm": 2.359375, + "learning_rate": 1.4187111111111112e-05, + "loss": 0.4129, + "step": 13080 + }, + { + "epoch": 0.29083718450053325, + "grad_norm": 2.0, + "learning_rate": 1.4182666666666667e-05, + "loss": 0.424, + "step": 13090 + }, + { + "epoch": 0.2910593672236047, + "grad_norm": 1.8203125, + "learning_rate": 1.4178222222222224e-05, + "loss": 0.3931, + "step": 13100 + }, + { + "epoch": 0.29128154994667615, + "grad_norm": 1.9921875, + "learning_rate": 1.417377777777778e-05, + "loss": 0.4318, + "step": 13110 + }, + { + "epoch": 0.2915037326697476, + "grad_norm": 1.8828125, + "learning_rate": 1.4169333333333335e-05, + "loss": 0.3904, + "step": 13120 + }, + { + "epoch": 0.29172591539281906, + "grad_norm": 2.34375, + "learning_rate": 1.416488888888889e-05, + "loss": 0.3825, + "step": 13130 + }, + { + "epoch": 0.2919480981158905, + "grad_norm": 2.296875, + "learning_rate": 1.4160444444444445e-05, + "loss": 0.3881, + "step": 13140 + }, + { + "epoch": 0.29217028083896196, + "grad_norm": 2.34375, + "learning_rate": 1.4156000000000001e-05, + "loss": 0.3714, + "step": 13150 + }, + { + "epoch": 0.2923924635620334, + "grad_norm": 2.109375, + "learning_rate": 1.4151555555555556e-05, + "loss": 0.413, + "step": 13160 + }, + { + "epoch": 0.29261464628510486, + "grad_norm": 2.25, + "learning_rate": 1.4147111111111111e-05, + "loss": 0.4085, + "step": 13170 + }, + { + "epoch": 0.2928368290081763, + "grad_norm": 2.03125, + "learning_rate": 1.4142666666666668e-05, + "loss": 0.4194, + "step": 13180 + }, + { + "epoch": 0.29305901173124776, + "grad_norm": 2.125, + "learning_rate": 1.4138222222222224e-05, + "loss": 0.3961, + "step": 13190 + }, + { + "epoch": 0.2932811944543192, + "grad_norm": 2.09375, + "learning_rate": 1.4133777777777779e-05, + "loss": 0.413, + "step": 13200 + }, + { + "epoch": 0.29350337717739067, + "grad_norm": 2.234375, + "learning_rate": 1.4129333333333335e-05, + "loss": 0.414, + "step": 13210 + }, + { + "epoch": 0.2937255599004621, + "grad_norm": 2.0, + "learning_rate": 1.412488888888889e-05, + "loss": 0.4066, + "step": 13220 + }, + { + "epoch": 0.29394774262353357, + "grad_norm": 2.140625, + "learning_rate": 1.4120444444444445e-05, + "loss": 0.4498, + "step": 13230 + }, + { + "epoch": 0.294169925346605, + "grad_norm": 1.7578125, + "learning_rate": 1.4116e-05, + "loss": 0.3566, + "step": 13240 + }, + { + "epoch": 0.29439210806967653, + "grad_norm": 1.9140625, + "learning_rate": 1.4111555555555557e-05, + "loss": 0.3573, + "step": 13250 + }, + { + "epoch": 0.294614290792748, + "grad_norm": 1.8984375, + "learning_rate": 1.4107111111111111e-05, + "loss": 0.3703, + "step": 13260 + }, + { + "epoch": 0.29483647351581943, + "grad_norm": 2.40625, + "learning_rate": 1.4102666666666666e-05, + "loss": 0.3922, + "step": 13270 + }, + { + "epoch": 0.2950586562388909, + "grad_norm": 2.375, + "learning_rate": 1.4098222222222224e-05, + "loss": 0.3918, + "step": 13280 + }, + { + "epoch": 0.29528083896196233, + "grad_norm": 2.15625, + "learning_rate": 1.409377777777778e-05, + "loss": 0.4299, + "step": 13290 + }, + { + "epoch": 0.2955030216850338, + "grad_norm": 2.21875, + "learning_rate": 1.4089333333333334e-05, + "loss": 0.4128, + "step": 13300 + }, + { + "epoch": 0.29572520440810524, + "grad_norm": 2.0625, + "learning_rate": 1.408488888888889e-05, + "loss": 0.4024, + "step": 13310 + }, + { + "epoch": 0.2959473871311767, + "grad_norm": 2.046875, + "learning_rate": 1.4080444444444446e-05, + "loss": 0.3516, + "step": 13320 + }, + { + "epoch": 0.29616956985424814, + "grad_norm": 2.140625, + "learning_rate": 1.4076e-05, + "loss": 0.4223, + "step": 13330 + }, + { + "epoch": 0.2963917525773196, + "grad_norm": 2.265625, + "learning_rate": 1.4071555555555555e-05, + "loss": 0.4104, + "step": 13340 + }, + { + "epoch": 0.29661393530039104, + "grad_norm": 1.8984375, + "learning_rate": 1.4067111111111112e-05, + "loss": 0.4217, + "step": 13350 + }, + { + "epoch": 0.2968361180234625, + "grad_norm": 2.359375, + "learning_rate": 1.4062666666666668e-05, + "loss": 0.4201, + "step": 13360 + }, + { + "epoch": 0.29705830074653394, + "grad_norm": 1.84375, + "learning_rate": 1.4058222222222225e-05, + "loss": 0.3873, + "step": 13370 + }, + { + "epoch": 0.2972804834696054, + "grad_norm": 1.9140625, + "learning_rate": 1.405377777777778e-05, + "loss": 0.4223, + "step": 13380 + }, + { + "epoch": 0.29750266619267685, + "grad_norm": 2.0, + "learning_rate": 1.4049333333333335e-05, + "loss": 0.3799, + "step": 13390 + }, + { + "epoch": 0.2977248489157483, + "grad_norm": 2.09375, + "learning_rate": 1.404488888888889e-05, + "loss": 0.4534, + "step": 13400 + }, + { + "epoch": 0.29794703163881975, + "grad_norm": 1.9609375, + "learning_rate": 1.4040444444444446e-05, + "loss": 0.3999, + "step": 13410 + }, + { + "epoch": 0.2981692143618912, + "grad_norm": 2.046875, + "learning_rate": 1.4036e-05, + "loss": 0.4292, + "step": 13420 + }, + { + "epoch": 0.29839139708496265, + "grad_norm": 1.890625, + "learning_rate": 1.4031555555555556e-05, + "loss": 0.3904, + "step": 13430 + }, + { + "epoch": 0.2986135798080341, + "grad_norm": 2.390625, + "learning_rate": 1.4027111111111112e-05, + "loss": 0.4645, + "step": 13440 + }, + { + "epoch": 0.2988357625311056, + "grad_norm": 2.484375, + "learning_rate": 1.4022666666666669e-05, + "loss": 0.4084, + "step": 13450 + }, + { + "epoch": 0.29905794525417706, + "grad_norm": 2.125, + "learning_rate": 1.4018222222222224e-05, + "loss": 0.437, + "step": 13460 + }, + { + "epoch": 0.2992801279772485, + "grad_norm": 2.203125, + "learning_rate": 1.401377777777778e-05, + "loss": 0.3942, + "step": 13470 + }, + { + "epoch": 0.29950231070031996, + "grad_norm": 1.953125, + "learning_rate": 1.4009333333333335e-05, + "loss": 0.397, + "step": 13480 + }, + { + "epoch": 0.2997244934233914, + "grad_norm": 2.3125, + "learning_rate": 1.400488888888889e-05, + "loss": 0.4051, + "step": 13490 + }, + { + "epoch": 0.29994667614646287, + "grad_norm": 1.8125, + "learning_rate": 1.4000444444444445e-05, + "loss": 0.369, + "step": 13500 + }, + { + "epoch": 0.3001688588695343, + "grad_norm": 2.375, + "learning_rate": 1.3996000000000001e-05, + "loss": 0.3987, + "step": 13510 + }, + { + "epoch": 0.30039104159260577, + "grad_norm": 2.34375, + "learning_rate": 1.3991555555555556e-05, + "loss": 0.4054, + "step": 13520 + }, + { + "epoch": 0.3006132243156772, + "grad_norm": 2.140625, + "learning_rate": 1.3987111111111111e-05, + "loss": 0.3922, + "step": 13530 + }, + { + "epoch": 0.30083540703874867, + "grad_norm": 1.828125, + "learning_rate": 1.3982666666666669e-05, + "loss": 0.3975, + "step": 13540 + }, + { + "epoch": 0.3010575897618201, + "grad_norm": 1.9765625, + "learning_rate": 1.3978222222222224e-05, + "loss": 0.3955, + "step": 13550 + }, + { + "epoch": 0.3012797724848916, + "grad_norm": 1.984375, + "learning_rate": 1.3973777777777779e-05, + "loss": 0.4114, + "step": 13560 + }, + { + "epoch": 0.301501955207963, + "grad_norm": 2.109375, + "learning_rate": 1.3969333333333335e-05, + "loss": 0.4023, + "step": 13570 + }, + { + "epoch": 0.3017241379310345, + "grad_norm": 2.1875, + "learning_rate": 1.396488888888889e-05, + "loss": 0.3926, + "step": 13580 + }, + { + "epoch": 0.3019463206541059, + "grad_norm": 2.296875, + "learning_rate": 1.3960444444444445e-05, + "loss": 0.4103, + "step": 13590 + }, + { + "epoch": 0.3021685033771774, + "grad_norm": 1.953125, + "learning_rate": 1.3956e-05, + "loss": 0.4331, + "step": 13600 + }, + { + "epoch": 0.30239068610024883, + "grad_norm": 2.28125, + "learning_rate": 1.3951555555555556e-05, + "loss": 0.3934, + "step": 13610 + }, + { + "epoch": 0.3026128688233203, + "grad_norm": 2.328125, + "learning_rate": 1.3947111111111111e-05, + "loss": 0.4256, + "step": 13620 + }, + { + "epoch": 0.30283505154639173, + "grad_norm": 2.0, + "learning_rate": 1.394266666666667e-05, + "loss": 0.4013, + "step": 13630 + }, + { + "epoch": 0.3030572342694632, + "grad_norm": 2.25, + "learning_rate": 1.3938222222222224e-05, + "loss": 0.4024, + "step": 13640 + }, + { + "epoch": 0.30327941699253463, + "grad_norm": 1.953125, + "learning_rate": 1.393377777777778e-05, + "loss": 0.3839, + "step": 13650 + }, + { + "epoch": 0.30350159971560614, + "grad_norm": 2.015625, + "learning_rate": 1.3929333333333334e-05, + "loss": 0.3905, + "step": 13660 + }, + { + "epoch": 0.3037237824386776, + "grad_norm": 2.140625, + "learning_rate": 1.392488888888889e-05, + "loss": 0.3926, + "step": 13670 + }, + { + "epoch": 0.30394596516174904, + "grad_norm": 1.921875, + "learning_rate": 1.3920444444444445e-05, + "loss": 0.3708, + "step": 13680 + }, + { + "epoch": 0.3041681478848205, + "grad_norm": 2.046875, + "learning_rate": 1.3916e-05, + "loss": 0.3952, + "step": 13690 + }, + { + "epoch": 0.30439033060789195, + "grad_norm": 2.296875, + "learning_rate": 1.3911555555555555e-05, + "loss": 0.428, + "step": 13700 + }, + { + "epoch": 0.3046125133309634, + "grad_norm": 1.78125, + "learning_rate": 1.3907111111111112e-05, + "loss": 0.3858, + "step": 13710 + }, + { + "epoch": 0.30483469605403485, + "grad_norm": 2.46875, + "learning_rate": 1.3902666666666668e-05, + "loss": 0.4229, + "step": 13720 + }, + { + "epoch": 0.3050568787771063, + "grad_norm": 2.3125, + "learning_rate": 1.3898222222222225e-05, + "loss": 0.4229, + "step": 13730 + }, + { + "epoch": 0.30527906150017775, + "grad_norm": 2.078125, + "learning_rate": 1.389377777777778e-05, + "loss": 0.3765, + "step": 13740 + }, + { + "epoch": 0.3055012442232492, + "grad_norm": 2.234375, + "learning_rate": 1.3889333333333334e-05, + "loss": 0.4365, + "step": 13750 + }, + { + "epoch": 0.30572342694632065, + "grad_norm": 2.171875, + "learning_rate": 1.388488888888889e-05, + "loss": 0.4774, + "step": 13760 + }, + { + "epoch": 0.3059456096693921, + "grad_norm": 2.03125, + "learning_rate": 1.3880444444444446e-05, + "loss": 0.4109, + "step": 13770 + }, + { + "epoch": 0.30616779239246356, + "grad_norm": 2.171875, + "learning_rate": 1.3876e-05, + "loss": 0.4125, + "step": 13780 + }, + { + "epoch": 0.306389975115535, + "grad_norm": 1.90625, + "learning_rate": 1.3871555555555555e-05, + "loss": 0.3687, + "step": 13790 + }, + { + "epoch": 0.30661215783860646, + "grad_norm": 2.171875, + "learning_rate": 1.3867111111111112e-05, + "loss": 0.4284, + "step": 13800 + }, + { + "epoch": 0.3068343405616779, + "grad_norm": 2.34375, + "learning_rate": 1.3862666666666669e-05, + "loss": 0.3744, + "step": 13810 + }, + { + "epoch": 0.30705652328474936, + "grad_norm": 1.828125, + "learning_rate": 1.3858222222222223e-05, + "loss": 0.4065, + "step": 13820 + }, + { + "epoch": 0.3072787060078208, + "grad_norm": 1.9453125, + "learning_rate": 1.385377777777778e-05, + "loss": 0.4315, + "step": 13830 + }, + { + "epoch": 0.30750088873089226, + "grad_norm": 2.140625, + "learning_rate": 1.3849333333333335e-05, + "loss": 0.3864, + "step": 13840 + }, + { + "epoch": 0.3077230714539637, + "grad_norm": 2.109375, + "learning_rate": 1.384488888888889e-05, + "loss": 0.3963, + "step": 13850 + }, + { + "epoch": 0.3079452541770352, + "grad_norm": 2.1875, + "learning_rate": 1.3840444444444444e-05, + "loss": 0.3705, + "step": 13860 + }, + { + "epoch": 0.3081674369001067, + "grad_norm": 2.0, + "learning_rate": 1.3836000000000001e-05, + "loss": 0.3835, + "step": 13870 + }, + { + "epoch": 0.3083896196231781, + "grad_norm": 2.625, + "learning_rate": 1.3831555555555556e-05, + "loss": 0.441, + "step": 13880 + }, + { + "epoch": 0.3086118023462496, + "grad_norm": 2.03125, + "learning_rate": 1.382711111111111e-05, + "loss": 0.4058, + "step": 13890 + }, + { + "epoch": 0.308833985069321, + "grad_norm": 2.390625, + "learning_rate": 1.3822666666666669e-05, + "loss": 0.4046, + "step": 13900 + }, + { + "epoch": 0.3090561677923925, + "grad_norm": 2.515625, + "learning_rate": 1.3818222222222224e-05, + "loss": 0.4499, + "step": 13910 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 2.359375, + "learning_rate": 1.3813777777777779e-05, + "loss": 0.4132, + "step": 13920 + }, + { + "epoch": 0.3095005332385354, + "grad_norm": 2.078125, + "learning_rate": 1.3809333333333335e-05, + "loss": 0.3895, + "step": 13930 + }, + { + "epoch": 0.30972271596160683, + "grad_norm": 2.0, + "learning_rate": 1.380488888888889e-05, + "loss": 0.417, + "step": 13940 + }, + { + "epoch": 0.3099448986846783, + "grad_norm": 2.078125, + "learning_rate": 1.3800444444444445e-05, + "loss": 0.3978, + "step": 13950 + }, + { + "epoch": 0.31016708140774973, + "grad_norm": 2.0625, + "learning_rate": 1.3796e-05, + "loss": 0.4105, + "step": 13960 + }, + { + "epoch": 0.3103892641308212, + "grad_norm": 2.171875, + "learning_rate": 1.3791555555555556e-05, + "loss": 0.3786, + "step": 13970 + }, + { + "epoch": 0.31061144685389264, + "grad_norm": 1.984375, + "learning_rate": 1.3787111111111113e-05, + "loss": 0.4309, + "step": 13980 + }, + { + "epoch": 0.3108336295769641, + "grad_norm": 2.3125, + "learning_rate": 1.378266666666667e-05, + "loss": 0.3947, + "step": 13990 + }, + { + "epoch": 0.31105581230003554, + "grad_norm": 2.078125, + "learning_rate": 1.3778222222222224e-05, + "loss": 0.3722, + "step": 14000 + }, + { + "epoch": 0.311277995023107, + "grad_norm": 2.5625, + "learning_rate": 1.3773777777777779e-05, + "loss": 0.4129, + "step": 14010 + }, + { + "epoch": 0.31150017774617844, + "grad_norm": 1.875, + "learning_rate": 1.3769333333333334e-05, + "loss": 0.4232, + "step": 14020 + }, + { + "epoch": 0.3117223604692499, + "grad_norm": 2.0, + "learning_rate": 1.376488888888889e-05, + "loss": 0.4003, + "step": 14030 + }, + { + "epoch": 0.31194454319232134, + "grad_norm": 2.546875, + "learning_rate": 1.3760444444444445e-05, + "loss": 0.4271, + "step": 14040 + }, + { + "epoch": 0.3121667259153928, + "grad_norm": 2.390625, + "learning_rate": 1.3756e-05, + "loss": 0.4188, + "step": 14050 + }, + { + "epoch": 0.31238890863846425, + "grad_norm": 2.0, + "learning_rate": 1.3751555555555555e-05, + "loss": 0.3805, + "step": 14060 + }, + { + "epoch": 0.31261109136153575, + "grad_norm": 2.03125, + "learning_rate": 1.3747111111111113e-05, + "loss": 0.4182, + "step": 14070 + }, + { + "epoch": 0.3128332740846072, + "grad_norm": 2.03125, + "learning_rate": 1.3742666666666668e-05, + "loss": 0.4018, + "step": 14080 + }, + { + "epoch": 0.31305545680767866, + "grad_norm": 2.0625, + "learning_rate": 1.3738222222222225e-05, + "loss": 0.4084, + "step": 14090 + }, + { + "epoch": 0.3132776395307501, + "grad_norm": 2.390625, + "learning_rate": 1.373377777777778e-05, + "loss": 0.3705, + "step": 14100 + }, + { + "epoch": 0.31349982225382156, + "grad_norm": 2.296875, + "learning_rate": 1.3729333333333334e-05, + "loss": 0.413, + "step": 14110 + }, + { + "epoch": 0.313722004976893, + "grad_norm": 2.609375, + "learning_rate": 1.3724888888888889e-05, + "loss": 0.4258, + "step": 14120 + }, + { + "epoch": 0.31394418769996446, + "grad_norm": 2.046875, + "learning_rate": 1.3720444444444446e-05, + "loss": 0.4217, + "step": 14130 + }, + { + "epoch": 0.3141663704230359, + "grad_norm": 2.109375, + "learning_rate": 1.3716e-05, + "loss": 0.428, + "step": 14140 + }, + { + "epoch": 0.31438855314610736, + "grad_norm": 1.9453125, + "learning_rate": 1.3711555555555555e-05, + "loss": 0.3744, + "step": 14150 + }, + { + "epoch": 0.3146107358691788, + "grad_norm": 2.203125, + "learning_rate": 1.3707111111111114e-05, + "loss": 0.4357, + "step": 14160 + }, + { + "epoch": 0.31483291859225027, + "grad_norm": 1.84375, + "learning_rate": 1.3702666666666668e-05, + "loss": 0.4138, + "step": 14170 + }, + { + "epoch": 0.3150551013153217, + "grad_norm": 2.171875, + "learning_rate": 1.3698222222222223e-05, + "loss": 0.3804, + "step": 14180 + }, + { + "epoch": 0.31527728403839317, + "grad_norm": 2.09375, + "learning_rate": 1.369377777777778e-05, + "loss": 0.4022, + "step": 14190 + }, + { + "epoch": 0.3154994667614646, + "grad_norm": 2.15625, + "learning_rate": 1.3689333333333335e-05, + "loss": 0.4232, + "step": 14200 + }, + { + "epoch": 0.31572164948453607, + "grad_norm": 2.15625, + "learning_rate": 1.368488888888889e-05, + "loss": 0.4258, + "step": 14210 + }, + { + "epoch": 0.3159438322076075, + "grad_norm": 2.34375, + "learning_rate": 1.3680444444444444e-05, + "loss": 0.4339, + "step": 14220 + }, + { + "epoch": 0.316166014930679, + "grad_norm": 1.90625, + "learning_rate": 1.3676000000000001e-05, + "loss": 0.3538, + "step": 14230 + }, + { + "epoch": 0.3163881976537504, + "grad_norm": 2.453125, + "learning_rate": 1.3671555555555556e-05, + "loss": 0.4367, + "step": 14240 + }, + { + "epoch": 0.3166103803768219, + "grad_norm": 2.03125, + "learning_rate": 1.3667111111111114e-05, + "loss": 0.3788, + "step": 14250 + }, + { + "epoch": 0.3168325630998933, + "grad_norm": 2.1875, + "learning_rate": 1.3662666666666669e-05, + "loss": 0.4369, + "step": 14260 + }, + { + "epoch": 0.3170547458229648, + "grad_norm": 2.296875, + "learning_rate": 1.3658222222222224e-05, + "loss": 0.4107, + "step": 14270 + }, + { + "epoch": 0.3172769285460363, + "grad_norm": 2.375, + "learning_rate": 1.3653777777777778e-05, + "loss": 0.4378, + "step": 14280 + }, + { + "epoch": 0.31749911126910774, + "grad_norm": 1.8515625, + "learning_rate": 1.3649333333333335e-05, + "loss": 0.3985, + "step": 14290 + }, + { + "epoch": 0.3177212939921792, + "grad_norm": 2.421875, + "learning_rate": 1.364488888888889e-05, + "loss": 0.3981, + "step": 14300 + }, + { + "epoch": 0.31794347671525064, + "grad_norm": 2.0, + "learning_rate": 1.3640444444444445e-05, + "loss": 0.3764, + "step": 14310 + }, + { + "epoch": 0.3181656594383221, + "grad_norm": 2.046875, + "learning_rate": 1.3636e-05, + "loss": 0.3738, + "step": 14320 + }, + { + "epoch": 0.31838784216139354, + "grad_norm": 1.6875, + "learning_rate": 1.3631555555555556e-05, + "loss": 0.4342, + "step": 14330 + }, + { + "epoch": 0.318610024884465, + "grad_norm": 1.9140625, + "learning_rate": 1.3627111111111113e-05, + "loss": 0.4101, + "step": 14340 + }, + { + "epoch": 0.31883220760753644, + "grad_norm": 2.34375, + "learning_rate": 1.362266666666667e-05, + "loss": 0.3919, + "step": 14350 + }, + { + "epoch": 0.3190543903306079, + "grad_norm": 2.34375, + "learning_rate": 1.3618222222222224e-05, + "loss": 0.3694, + "step": 14360 + }, + { + "epoch": 0.31927657305367935, + "grad_norm": 2.15625, + "learning_rate": 1.3613777777777779e-05, + "loss": 0.4038, + "step": 14370 + }, + { + "epoch": 0.3194987557767508, + "grad_norm": 2.015625, + "learning_rate": 1.3609333333333334e-05, + "loss": 0.4014, + "step": 14380 + }, + { + "epoch": 0.31972093849982225, + "grad_norm": 2.234375, + "learning_rate": 1.360488888888889e-05, + "loss": 0.377, + "step": 14390 + }, + { + "epoch": 0.3199431212228937, + "grad_norm": 2.078125, + "learning_rate": 1.3600444444444445e-05, + "loss": 0.4202, + "step": 14400 + }, + { + "epoch": 0.32016530394596515, + "grad_norm": 2.140625, + "learning_rate": 1.3596e-05, + "loss": 0.3957, + "step": 14410 + }, + { + "epoch": 0.3203874866690366, + "grad_norm": 2.0, + "learning_rate": 1.3591555555555555e-05, + "loss": 0.4077, + "step": 14420 + }, + { + "epoch": 0.32060966939210805, + "grad_norm": 2.46875, + "learning_rate": 1.3587111111111113e-05, + "loss": 0.4165, + "step": 14430 + }, + { + "epoch": 0.3208318521151795, + "grad_norm": 2.5, + "learning_rate": 1.3582666666666668e-05, + "loss": 0.4065, + "step": 14440 + }, + { + "epoch": 0.32105403483825096, + "grad_norm": 2.09375, + "learning_rate": 1.3578222222222224e-05, + "loss": 0.3884, + "step": 14450 + }, + { + "epoch": 0.3212762175613224, + "grad_norm": 2.0625, + "learning_rate": 1.357377777777778e-05, + "loss": 0.403, + "step": 14460 + }, + { + "epoch": 0.32149840028439386, + "grad_norm": 2.625, + "learning_rate": 1.3569333333333334e-05, + "loss": 0.4437, + "step": 14470 + }, + { + "epoch": 0.32172058300746537, + "grad_norm": 1.859375, + "learning_rate": 1.3564888888888889e-05, + "loss": 0.3638, + "step": 14480 + }, + { + "epoch": 0.3219427657305368, + "grad_norm": 2.09375, + "learning_rate": 1.3560444444444445e-05, + "loss": 0.4478, + "step": 14490 + }, + { + "epoch": 0.32216494845360827, + "grad_norm": 2.265625, + "learning_rate": 1.3556e-05, + "loss": 0.3968, + "step": 14500 + }, + { + "epoch": 0.3223871311766797, + "grad_norm": 2.859375, + "learning_rate": 1.3551555555555555e-05, + "loss": 0.4, + "step": 14510 + }, + { + "epoch": 0.32260931389975117, + "grad_norm": 1.90625, + "learning_rate": 1.3547111111111113e-05, + "loss": 0.4057, + "step": 14520 + }, + { + "epoch": 0.3228314966228226, + "grad_norm": 2.03125, + "learning_rate": 1.3542666666666668e-05, + "loss": 0.3609, + "step": 14530 + }, + { + "epoch": 0.3230536793458941, + "grad_norm": 1.9765625, + "learning_rate": 1.3538222222222223e-05, + "loss": 0.4431, + "step": 14540 + }, + { + "epoch": 0.3232758620689655, + "grad_norm": 2.25, + "learning_rate": 1.353377777777778e-05, + "loss": 0.3886, + "step": 14550 + }, + { + "epoch": 0.323498044792037, + "grad_norm": 2.0625, + "learning_rate": 1.3529333333333334e-05, + "loss": 0.3583, + "step": 14560 + }, + { + "epoch": 0.3237202275151084, + "grad_norm": 2.03125, + "learning_rate": 1.352488888888889e-05, + "loss": 0.3723, + "step": 14570 + }, + { + "epoch": 0.3239424102381799, + "grad_norm": 1.921875, + "learning_rate": 1.3520444444444444e-05, + "loss": 0.4102, + "step": 14580 + }, + { + "epoch": 0.32416459296125133, + "grad_norm": 1.9609375, + "learning_rate": 1.3516e-05, + "loss": 0.4085, + "step": 14590 + }, + { + "epoch": 0.3243867756843228, + "grad_norm": 2.75, + "learning_rate": 1.3511555555555557e-05, + "loss": 0.3869, + "step": 14600 + }, + { + "epoch": 0.32460895840739423, + "grad_norm": 2.359375, + "learning_rate": 1.3507111111111114e-05, + "loss": 0.4148, + "step": 14610 + }, + { + "epoch": 0.3248311411304657, + "grad_norm": 2.03125, + "learning_rate": 1.3502666666666669e-05, + "loss": 0.3853, + "step": 14620 + }, + { + "epoch": 0.32505332385353713, + "grad_norm": 1.8984375, + "learning_rate": 1.3498222222222223e-05, + "loss": 0.3862, + "step": 14630 + }, + { + "epoch": 0.3252755065766086, + "grad_norm": 2.140625, + "learning_rate": 1.3493777777777778e-05, + "loss": 0.3866, + "step": 14640 + }, + { + "epoch": 0.32549768929968004, + "grad_norm": 2.234375, + "learning_rate": 1.3489333333333335e-05, + "loss": 0.3989, + "step": 14650 + }, + { + "epoch": 0.3257198720227515, + "grad_norm": 1.890625, + "learning_rate": 1.348488888888889e-05, + "loss": 0.3554, + "step": 14660 + }, + { + "epoch": 0.32594205474582294, + "grad_norm": 1.828125, + "learning_rate": 1.3480444444444445e-05, + "loss": 0.3863, + "step": 14670 + }, + { + "epoch": 0.3261642374688944, + "grad_norm": 2.015625, + "learning_rate": 1.3476e-05, + "loss": 0.3912, + "step": 14680 + }, + { + "epoch": 0.3263864201919659, + "grad_norm": 1.8671875, + "learning_rate": 1.3471555555555558e-05, + "loss": 0.4403, + "step": 14690 + }, + { + "epoch": 0.32660860291503735, + "grad_norm": 2.453125, + "learning_rate": 1.3467111111111112e-05, + "loss": 0.4324, + "step": 14700 + }, + { + "epoch": 0.3268307856381088, + "grad_norm": 2.09375, + "learning_rate": 1.3462666666666669e-05, + "loss": 0.3766, + "step": 14710 + }, + { + "epoch": 0.32705296836118025, + "grad_norm": 1.890625, + "learning_rate": 1.3458222222222224e-05, + "loss": 0.4056, + "step": 14720 + }, + { + "epoch": 0.3272751510842517, + "grad_norm": 2.125, + "learning_rate": 1.3453777777777779e-05, + "loss": 0.3994, + "step": 14730 + }, + { + "epoch": 0.32749733380732315, + "grad_norm": 2.015625, + "learning_rate": 1.3449333333333334e-05, + "loss": 0.4055, + "step": 14740 + }, + { + "epoch": 0.3277195165303946, + "grad_norm": 2.046875, + "learning_rate": 1.344488888888889e-05, + "loss": 0.4287, + "step": 14750 + }, + { + "epoch": 0.32794169925346606, + "grad_norm": 1.765625, + "learning_rate": 1.3440444444444445e-05, + "loss": 0.3927, + "step": 14760 + }, + { + "epoch": 0.3281638819765375, + "grad_norm": 1.984375, + "learning_rate": 1.3436e-05, + "loss": 0.418, + "step": 14770 + }, + { + "epoch": 0.32838606469960896, + "grad_norm": 2.25, + "learning_rate": 1.3431555555555558e-05, + "loss": 0.379, + "step": 14780 + }, + { + "epoch": 0.3286082474226804, + "grad_norm": 2.09375, + "learning_rate": 1.3427111111111113e-05, + "loss": 0.4045, + "step": 14790 + }, + { + "epoch": 0.32883043014575186, + "grad_norm": 2.109375, + "learning_rate": 1.3422666666666668e-05, + "loss": 0.4052, + "step": 14800 + }, + { + "epoch": 0.3290526128688233, + "grad_norm": 2.46875, + "learning_rate": 1.3418222222222224e-05, + "loss": 0.4304, + "step": 14810 + }, + { + "epoch": 0.32927479559189476, + "grad_norm": 2.0625, + "learning_rate": 1.3413777777777779e-05, + "loss": 0.4112, + "step": 14820 + }, + { + "epoch": 0.3294969783149662, + "grad_norm": 2.359375, + "learning_rate": 1.3409333333333334e-05, + "loss": 0.3764, + "step": 14830 + }, + { + "epoch": 0.32971916103803767, + "grad_norm": 2.125, + "learning_rate": 1.3404888888888889e-05, + "loss": 0.3632, + "step": 14840 + }, + { + "epoch": 0.3299413437611091, + "grad_norm": 2.328125, + "learning_rate": 1.3400444444444445e-05, + "loss": 0.4074, + "step": 14850 + }, + { + "epoch": 0.33016352648418057, + "grad_norm": 2.28125, + "learning_rate": 1.3396e-05, + "loss": 0.3798, + "step": 14860 + }, + { + "epoch": 0.330385709207252, + "grad_norm": 2.546875, + "learning_rate": 1.3391555555555558e-05, + "loss": 0.4275, + "step": 14870 + }, + { + "epoch": 0.33060789193032347, + "grad_norm": 2.734375, + "learning_rate": 1.3387111111111113e-05, + "loss": 0.3696, + "step": 14880 + }, + { + "epoch": 0.330830074653395, + "grad_norm": 1.9765625, + "learning_rate": 1.3382666666666668e-05, + "loss": 0.3988, + "step": 14890 + }, + { + "epoch": 0.33105225737646643, + "grad_norm": 2.0, + "learning_rate": 1.3378222222222223e-05, + "loss": 0.3687, + "step": 14900 + }, + { + "epoch": 0.3312744400995379, + "grad_norm": 1.96875, + "learning_rate": 1.337377777777778e-05, + "loss": 0.3799, + "step": 14910 + }, + { + "epoch": 0.33149662282260933, + "grad_norm": 2.25, + "learning_rate": 1.3369333333333334e-05, + "loss": 0.3866, + "step": 14920 + }, + { + "epoch": 0.3317188055456808, + "grad_norm": 2.015625, + "learning_rate": 1.3364888888888889e-05, + "loss": 0.3815, + "step": 14930 + }, + { + "epoch": 0.33194098826875224, + "grad_norm": 2.46875, + "learning_rate": 1.3360444444444444e-05, + "loss": 0.3885, + "step": 14940 + }, + { + "epoch": 0.3321631709918237, + "grad_norm": 2.0625, + "learning_rate": 1.3356e-05, + "loss": 0.3973, + "step": 14950 + }, + { + "epoch": 0.33238535371489514, + "grad_norm": 1.921875, + "learning_rate": 1.3351555555555557e-05, + "loss": 0.3729, + "step": 14960 + }, + { + "epoch": 0.3326075364379666, + "grad_norm": 2.421875, + "learning_rate": 1.3347111111111114e-05, + "loss": 0.3958, + "step": 14970 + }, + { + "epoch": 0.33282971916103804, + "grad_norm": 2.6875, + "learning_rate": 1.3342666666666668e-05, + "loss": 0.4206, + "step": 14980 + }, + { + "epoch": 0.3330519018841095, + "grad_norm": 1.984375, + "learning_rate": 1.3338222222222223e-05, + "loss": 0.411, + "step": 14990 + }, + { + "epoch": 0.33327408460718094, + "grad_norm": 2.1875, + "learning_rate": 1.3333777777777778e-05, + "loss": 0.3738, + "step": 15000 + }, + { + "epoch": 0.3334962673302524, + "grad_norm": 1.7109375, + "learning_rate": 1.3329333333333335e-05, + "loss": 0.3963, + "step": 15010 + }, + { + "epoch": 0.33371845005332385, + "grad_norm": 2.3125, + "learning_rate": 1.332488888888889e-05, + "loss": 0.4623, + "step": 15020 + }, + { + "epoch": 0.3339406327763953, + "grad_norm": 1.96875, + "learning_rate": 1.3320444444444444e-05, + "loss": 0.4067, + "step": 15030 + }, + { + "epoch": 0.33416281549946675, + "grad_norm": 1.9921875, + "learning_rate": 1.3316e-05, + "loss": 0.3753, + "step": 15040 + }, + { + "epoch": 0.3343849982225382, + "grad_norm": 2.078125, + "learning_rate": 1.3311555555555557e-05, + "loss": 0.4527, + "step": 15050 + }, + { + "epoch": 0.33460718094560965, + "grad_norm": 2.15625, + "learning_rate": 1.3307111111111112e-05, + "loss": 0.3978, + "step": 15060 + }, + { + "epoch": 0.3348293636686811, + "grad_norm": 1.7109375, + "learning_rate": 1.3302666666666669e-05, + "loss": 0.3915, + "step": 15070 + }, + { + "epoch": 0.33505154639175255, + "grad_norm": 2.34375, + "learning_rate": 1.3298222222222224e-05, + "loss": 0.4272, + "step": 15080 + }, + { + "epoch": 0.335273729114824, + "grad_norm": 2.265625, + "learning_rate": 1.3293777777777779e-05, + "loss": 0.3927, + "step": 15090 + }, + { + "epoch": 0.3354959118378955, + "grad_norm": 1.96875, + "learning_rate": 1.3289333333333333e-05, + "loss": 0.4217, + "step": 15100 + }, + { + "epoch": 0.33571809456096696, + "grad_norm": 2.25, + "learning_rate": 1.328488888888889e-05, + "loss": 0.4208, + "step": 15110 + }, + { + "epoch": 0.3359402772840384, + "grad_norm": 2.140625, + "learning_rate": 1.3280444444444445e-05, + "loss": 0.3905, + "step": 15120 + }, + { + "epoch": 0.33616246000710986, + "grad_norm": 2.25, + "learning_rate": 1.3276e-05, + "loss": 0.4056, + "step": 15130 + }, + { + "epoch": 0.3363846427301813, + "grad_norm": 2.078125, + "learning_rate": 1.3271555555555558e-05, + "loss": 0.4163, + "step": 15140 + }, + { + "epoch": 0.33660682545325277, + "grad_norm": 2.21875, + "learning_rate": 1.3267111111111113e-05, + "loss": 0.4131, + "step": 15150 + }, + { + "epoch": 0.3368290081763242, + "grad_norm": 1.921875, + "learning_rate": 1.3262666666666668e-05, + "loss": 0.3995, + "step": 15160 + }, + { + "epoch": 0.33705119089939567, + "grad_norm": 2.265625, + "learning_rate": 1.3258222222222224e-05, + "loss": 0.4, + "step": 15170 + }, + { + "epoch": 0.3372733736224671, + "grad_norm": 2.46875, + "learning_rate": 1.3253777777777779e-05, + "loss": 0.4479, + "step": 15180 + }, + { + "epoch": 0.33749555634553857, + "grad_norm": 2.171875, + "learning_rate": 1.3249333333333334e-05, + "loss": 0.4247, + "step": 15190 + }, + { + "epoch": 0.33771773906861, + "grad_norm": 2.21875, + "learning_rate": 1.3244888888888889e-05, + "loss": 0.4119, + "step": 15200 + }, + { + "epoch": 0.3379399217916815, + "grad_norm": 2.125, + "learning_rate": 1.3240444444444445e-05, + "loss": 0.4206, + "step": 15210 + }, + { + "epoch": 0.3381621045147529, + "grad_norm": 2.015625, + "learning_rate": 1.3236000000000002e-05, + "loss": 0.3911, + "step": 15220 + }, + { + "epoch": 0.3383842872378244, + "grad_norm": 2.015625, + "learning_rate": 1.3231555555555558e-05, + "loss": 0.3802, + "step": 15230 + }, + { + "epoch": 0.33860646996089583, + "grad_norm": 2.546875, + "learning_rate": 1.3227111111111113e-05, + "loss": 0.3835, + "step": 15240 + }, + { + "epoch": 0.3388286526839673, + "grad_norm": 2.203125, + "learning_rate": 1.3222666666666668e-05, + "loss": 0.4121, + "step": 15250 + }, + { + "epoch": 0.33905083540703873, + "grad_norm": 2.203125, + "learning_rate": 1.3218222222222223e-05, + "loss": 0.4217, + "step": 15260 + }, + { + "epoch": 0.3392730181301102, + "grad_norm": 2.171875, + "learning_rate": 1.321377777777778e-05, + "loss": 0.3714, + "step": 15270 + }, + { + "epoch": 0.33949520085318163, + "grad_norm": 2.15625, + "learning_rate": 1.3209333333333334e-05, + "loss": 0.4171, + "step": 15280 + }, + { + "epoch": 0.3397173835762531, + "grad_norm": 2.03125, + "learning_rate": 1.3204888888888889e-05, + "loss": 0.4203, + "step": 15290 + }, + { + "epoch": 0.3399395662993246, + "grad_norm": 2.21875, + "learning_rate": 1.3200444444444444e-05, + "loss": 0.4552, + "step": 15300 + }, + { + "epoch": 0.34016174902239604, + "grad_norm": 2.34375, + "learning_rate": 1.3196000000000002e-05, + "loss": 0.3636, + "step": 15310 + }, + { + "epoch": 0.3403839317454675, + "grad_norm": 2.140625, + "learning_rate": 1.3191555555555557e-05, + "loss": 0.414, + "step": 15320 + }, + { + "epoch": 0.34060611446853895, + "grad_norm": 2.265625, + "learning_rate": 1.3187111111111113e-05, + "loss": 0.3958, + "step": 15330 + }, + { + "epoch": 0.3408282971916104, + "grad_norm": 2.78125, + "learning_rate": 1.3182666666666668e-05, + "loss": 0.3564, + "step": 15340 + }, + { + "epoch": 0.34105047991468185, + "grad_norm": 2.0625, + "learning_rate": 1.3178222222222223e-05, + "loss": 0.3744, + "step": 15350 + }, + { + "epoch": 0.3412726626377533, + "grad_norm": 1.890625, + "learning_rate": 1.3173777777777778e-05, + "loss": 0.4155, + "step": 15360 + }, + { + "epoch": 0.34149484536082475, + "grad_norm": 2.1875, + "learning_rate": 1.3169333333333335e-05, + "loss": 0.4235, + "step": 15370 + }, + { + "epoch": 0.3417170280838962, + "grad_norm": 2.46875, + "learning_rate": 1.316488888888889e-05, + "loss": 0.4195, + "step": 15380 + }, + { + "epoch": 0.34193921080696765, + "grad_norm": 2.234375, + "learning_rate": 1.3160444444444444e-05, + "loss": 0.3463, + "step": 15390 + }, + { + "epoch": 0.3421613935300391, + "grad_norm": 2.375, + "learning_rate": 1.3156000000000002e-05, + "loss": 0.3943, + "step": 15400 + }, + { + "epoch": 0.34238357625311056, + "grad_norm": 2.03125, + "learning_rate": 1.3151555555555557e-05, + "loss": 0.405, + "step": 15410 + }, + { + "epoch": 0.342605758976182, + "grad_norm": 1.96875, + "learning_rate": 1.3147111111111112e-05, + "loss": 0.3886, + "step": 15420 + }, + { + "epoch": 0.34282794169925346, + "grad_norm": 2.21875, + "learning_rate": 1.3142666666666669e-05, + "loss": 0.4052, + "step": 15430 + }, + { + "epoch": 0.3430501244223249, + "grad_norm": 2.28125, + "learning_rate": 1.3138222222222224e-05, + "loss": 0.4238, + "step": 15440 + }, + { + "epoch": 0.34327230714539636, + "grad_norm": 2.203125, + "learning_rate": 1.3133777777777778e-05, + "loss": 0.3883, + "step": 15450 + }, + { + "epoch": 0.3434944898684678, + "grad_norm": 2.109375, + "learning_rate": 1.3129333333333333e-05, + "loss": 0.3901, + "step": 15460 + }, + { + "epoch": 0.34371667259153926, + "grad_norm": 2.203125, + "learning_rate": 1.312488888888889e-05, + "loss": 0.3939, + "step": 15470 + }, + { + "epoch": 0.3439388553146107, + "grad_norm": 2.1875, + "learning_rate": 1.3120444444444445e-05, + "loss": 0.3887, + "step": 15480 + }, + { + "epoch": 0.34416103803768217, + "grad_norm": 1.9375, + "learning_rate": 1.3116000000000003e-05, + "loss": 0.3727, + "step": 15490 + }, + { + "epoch": 0.3443832207607536, + "grad_norm": 2.15625, + "learning_rate": 1.3111555555555558e-05, + "loss": 0.3968, + "step": 15500 + }, + { + "epoch": 0.3446054034838251, + "grad_norm": 2.53125, + "learning_rate": 1.3107111111111113e-05, + "loss": 0.4136, + "step": 15510 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 2.1875, + "learning_rate": 1.3102666666666667e-05, + "loss": 0.3942, + "step": 15520 + }, + { + "epoch": 0.345049768929968, + "grad_norm": 2.15625, + "learning_rate": 1.3098222222222224e-05, + "loss": 0.4188, + "step": 15530 + }, + { + "epoch": 0.3452719516530395, + "grad_norm": 2.296875, + "learning_rate": 1.3093777777777779e-05, + "loss": 0.3917, + "step": 15540 + }, + { + "epoch": 0.34549413437611093, + "grad_norm": 2.1875, + "learning_rate": 1.3089333333333334e-05, + "loss": 0.4118, + "step": 15550 + }, + { + "epoch": 0.3457163170991824, + "grad_norm": 2.34375, + "learning_rate": 1.3084888888888888e-05, + "loss": 0.4058, + "step": 15560 + }, + { + "epoch": 0.34593849982225383, + "grad_norm": 2.015625, + "learning_rate": 1.3080444444444445e-05, + "loss": 0.3819, + "step": 15570 + }, + { + "epoch": 0.3461606825453253, + "grad_norm": 2.4375, + "learning_rate": 1.3076000000000002e-05, + "loss": 0.4499, + "step": 15580 + }, + { + "epoch": 0.34638286526839673, + "grad_norm": 2.28125, + "learning_rate": 1.3071555555555558e-05, + "loss": 0.3809, + "step": 15590 + }, + { + "epoch": 0.3466050479914682, + "grad_norm": 1.8671875, + "learning_rate": 1.3067111111111113e-05, + "loss": 0.3994, + "step": 15600 + }, + { + "epoch": 0.34682723071453964, + "grad_norm": 2.109375, + "learning_rate": 1.3062666666666668e-05, + "loss": 0.4276, + "step": 15610 + }, + { + "epoch": 0.3470494134376111, + "grad_norm": 2.21875, + "learning_rate": 1.3058222222222223e-05, + "loss": 0.358, + "step": 15620 + }, + { + "epoch": 0.34727159616068254, + "grad_norm": 2.171875, + "learning_rate": 1.3053777777777779e-05, + "loss": 0.4068, + "step": 15630 + }, + { + "epoch": 0.347493778883754, + "grad_norm": 2.078125, + "learning_rate": 1.3049333333333334e-05, + "loss": 0.4042, + "step": 15640 + }, + { + "epoch": 0.34771596160682544, + "grad_norm": 2.125, + "learning_rate": 1.3044888888888889e-05, + "loss": 0.3948, + "step": 15650 + }, + { + "epoch": 0.3479381443298969, + "grad_norm": 2.3125, + "learning_rate": 1.3040444444444444e-05, + "loss": 0.4206, + "step": 15660 + }, + { + "epoch": 0.34816032705296834, + "grad_norm": 1.90625, + "learning_rate": 1.3036000000000002e-05, + "loss": 0.3708, + "step": 15670 + }, + { + "epoch": 0.3483825097760398, + "grad_norm": 2.578125, + "learning_rate": 1.3031555555555557e-05, + "loss": 0.3844, + "step": 15680 + }, + { + "epoch": 0.34860469249911125, + "grad_norm": 2.65625, + "learning_rate": 1.3027111111111113e-05, + "loss": 0.4132, + "step": 15690 + }, + { + "epoch": 0.3488268752221827, + "grad_norm": 1.921875, + "learning_rate": 1.3022666666666668e-05, + "loss": 0.3929, + "step": 15700 + }, + { + "epoch": 0.3490490579452542, + "grad_norm": 1.9609375, + "learning_rate": 1.3018222222222223e-05, + "loss": 0.4396, + "step": 15710 + }, + { + "epoch": 0.34927124066832566, + "grad_norm": 2.125, + "learning_rate": 1.3013777777777778e-05, + "loss": 0.4274, + "step": 15720 + }, + { + "epoch": 0.3494934233913971, + "grad_norm": 2.34375, + "learning_rate": 1.3009333333333334e-05, + "loss": 0.389, + "step": 15730 + }, + { + "epoch": 0.34971560611446856, + "grad_norm": 2.4375, + "learning_rate": 1.300488888888889e-05, + "loss": 0.4186, + "step": 15740 + }, + { + "epoch": 0.34993778883754, + "grad_norm": 2.046875, + "learning_rate": 1.3000444444444444e-05, + "loss": 0.3345, + "step": 15750 + }, + { + "epoch": 0.35015997156061146, + "grad_norm": 2.21875, + "learning_rate": 1.2996000000000002e-05, + "loss": 0.4343, + "step": 15760 + }, + { + "epoch": 0.3503821542836829, + "grad_norm": 2.28125, + "learning_rate": 1.2991555555555557e-05, + "loss": 0.392, + "step": 15770 + }, + { + "epoch": 0.35060433700675436, + "grad_norm": 2.421875, + "learning_rate": 1.2987111111111112e-05, + "loss": 0.3902, + "step": 15780 + }, + { + "epoch": 0.3508265197298258, + "grad_norm": 2.375, + "learning_rate": 1.2982666666666669e-05, + "loss": 0.4401, + "step": 15790 + }, + { + "epoch": 0.35104870245289727, + "grad_norm": 2.546875, + "learning_rate": 1.2978222222222223e-05, + "loss": 0.3895, + "step": 15800 + }, + { + "epoch": 0.3512708851759687, + "grad_norm": 2.25, + "learning_rate": 1.2973777777777778e-05, + "loss": 0.3998, + "step": 15810 + }, + { + "epoch": 0.35149306789904017, + "grad_norm": 1.890625, + "learning_rate": 1.2969333333333333e-05, + "loss": 0.3912, + "step": 15820 + }, + { + "epoch": 0.3517152506221116, + "grad_norm": 2.0625, + "learning_rate": 1.296488888888889e-05, + "loss": 0.3873, + "step": 15830 + }, + { + "epoch": 0.35193743334518307, + "grad_norm": 2.15625, + "learning_rate": 1.2960444444444446e-05, + "loss": 0.3816, + "step": 15840 + }, + { + "epoch": 0.3521596160682545, + "grad_norm": 2.078125, + "learning_rate": 1.2956000000000003e-05, + "loss": 0.3876, + "step": 15850 + }, + { + "epoch": 0.352381798791326, + "grad_norm": 2.609375, + "learning_rate": 1.2951555555555558e-05, + "loss": 0.4435, + "step": 15860 + }, + { + "epoch": 0.3526039815143974, + "grad_norm": 2.28125, + "learning_rate": 1.2947111111111112e-05, + "loss": 0.4185, + "step": 15870 + }, + { + "epoch": 0.3528261642374689, + "grad_norm": 2.265625, + "learning_rate": 1.2942666666666667e-05, + "loss": 0.4111, + "step": 15880 + }, + { + "epoch": 0.3530483469605403, + "grad_norm": 2.046875, + "learning_rate": 1.2938222222222224e-05, + "loss": 0.3857, + "step": 15890 + }, + { + "epoch": 0.3532705296836118, + "grad_norm": 1.984375, + "learning_rate": 1.2933777777777779e-05, + "loss": 0.4239, + "step": 15900 + }, + { + "epoch": 0.35349271240668323, + "grad_norm": 2.359375, + "learning_rate": 1.2929333333333333e-05, + "loss": 0.4113, + "step": 15910 + }, + { + "epoch": 0.35371489512975474, + "grad_norm": 2.140625, + "learning_rate": 1.2924888888888888e-05, + "loss": 0.3829, + "step": 15920 + }, + { + "epoch": 0.3539370778528262, + "grad_norm": 2.046875, + "learning_rate": 1.2920444444444447e-05, + "loss": 0.4038, + "step": 15930 + }, + { + "epoch": 0.35415926057589764, + "grad_norm": 2.765625, + "learning_rate": 1.2916000000000001e-05, + "loss": 0.4277, + "step": 15940 + }, + { + "epoch": 0.3543814432989691, + "grad_norm": 2.3125, + "learning_rate": 1.2911555555555558e-05, + "loss": 0.3843, + "step": 15950 + }, + { + "epoch": 0.35460362602204054, + "grad_norm": 2.171875, + "learning_rate": 1.2907111111111113e-05, + "loss": 0.4202, + "step": 15960 + }, + { + "epoch": 0.354825808745112, + "grad_norm": 2.328125, + "learning_rate": 1.2902666666666668e-05, + "loss": 0.3613, + "step": 15970 + }, + { + "epoch": 0.35504799146818344, + "grad_norm": 2.046875, + "learning_rate": 1.2898222222222222e-05, + "loss": 0.3687, + "step": 15980 + }, + { + "epoch": 0.3552701741912549, + "grad_norm": 2.46875, + "learning_rate": 1.2893777777777779e-05, + "loss": 0.3963, + "step": 15990 + }, + { + "epoch": 0.35549235691432635, + "grad_norm": 2.109375, + "learning_rate": 1.2889333333333334e-05, + "loss": 0.3936, + "step": 16000 + }, + { + "epoch": 0.3557145396373978, + "grad_norm": 2.140625, + "learning_rate": 1.2884888888888889e-05, + "loss": 0.3891, + "step": 16010 + }, + { + "epoch": 0.35593672236046925, + "grad_norm": 2.296875, + "learning_rate": 1.2880444444444447e-05, + "loss": 0.406, + "step": 16020 + }, + { + "epoch": 0.3561589050835407, + "grad_norm": 2.078125, + "learning_rate": 1.2876000000000002e-05, + "loss": 0.401, + "step": 16030 + }, + { + "epoch": 0.35638108780661215, + "grad_norm": 2.46875, + "learning_rate": 1.2871555555555557e-05, + "loss": 0.4107, + "step": 16040 + }, + { + "epoch": 0.3566032705296836, + "grad_norm": 1.984375, + "learning_rate": 1.2867111111111113e-05, + "loss": 0.3754, + "step": 16050 + }, + { + "epoch": 0.35682545325275505, + "grad_norm": 2.4375, + "learning_rate": 1.2862666666666668e-05, + "loss": 0.3904, + "step": 16060 + }, + { + "epoch": 0.3570476359758265, + "grad_norm": 2.1875, + "learning_rate": 1.2858222222222223e-05, + "loss": 0.4039, + "step": 16070 + }, + { + "epoch": 0.35726981869889796, + "grad_norm": 2.203125, + "learning_rate": 1.2853777777777778e-05, + "loss": 0.3954, + "step": 16080 + }, + { + "epoch": 0.3574920014219694, + "grad_norm": 2.296875, + "learning_rate": 1.2849333333333334e-05, + "loss": 0.3688, + "step": 16090 + }, + { + "epoch": 0.35771418414504086, + "grad_norm": 2.1875, + "learning_rate": 1.2844888888888889e-05, + "loss": 0.4089, + "step": 16100 + }, + { + "epoch": 0.3579363668681123, + "grad_norm": 1.9765625, + "learning_rate": 1.2840444444444447e-05, + "loss": 0.3689, + "step": 16110 + }, + { + "epoch": 0.3581585495911838, + "grad_norm": 2.625, + "learning_rate": 1.2836000000000002e-05, + "loss": 0.3942, + "step": 16120 + }, + { + "epoch": 0.35838073231425527, + "grad_norm": 2.046875, + "learning_rate": 1.2831555555555557e-05, + "loss": 0.4092, + "step": 16130 + }, + { + "epoch": 0.3586029150373267, + "grad_norm": 2.390625, + "learning_rate": 1.2827111111111112e-05, + "loss": 0.3869, + "step": 16140 + }, + { + "epoch": 0.35882509776039817, + "grad_norm": 2.546875, + "learning_rate": 1.2822666666666668e-05, + "loss": 0.4189, + "step": 16150 + }, + { + "epoch": 0.3590472804834696, + "grad_norm": 2.1875, + "learning_rate": 1.2818222222222223e-05, + "loss": 0.4198, + "step": 16160 + }, + { + "epoch": 0.3592694632065411, + "grad_norm": 2.125, + "learning_rate": 1.2813777777777778e-05, + "loss": 0.4085, + "step": 16170 + }, + { + "epoch": 0.3594916459296125, + "grad_norm": 2.3125, + "learning_rate": 1.2809333333333333e-05, + "loss": 0.4151, + "step": 16180 + }, + { + "epoch": 0.359713828652684, + "grad_norm": 2.53125, + "learning_rate": 1.280488888888889e-05, + "loss": 0.3998, + "step": 16190 + }, + { + "epoch": 0.3599360113757554, + "grad_norm": 2.390625, + "learning_rate": 1.2800444444444446e-05, + "loss": 0.431, + "step": 16200 + }, + { + "epoch": 0.3601581940988269, + "grad_norm": 2.203125, + "learning_rate": 1.2796000000000003e-05, + "loss": 0.4133, + "step": 16210 + }, + { + "epoch": 0.36038037682189833, + "grad_norm": 2.375, + "learning_rate": 1.2791555555555557e-05, + "loss": 0.3874, + "step": 16220 + }, + { + "epoch": 0.3606025595449698, + "grad_norm": 2.125, + "learning_rate": 1.2787111111111112e-05, + "loss": 0.3901, + "step": 16230 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 2.625, + "learning_rate": 1.2782666666666667e-05, + "loss": 0.4536, + "step": 16240 + }, + { + "epoch": 0.3610469249911127, + "grad_norm": 2.5, + "learning_rate": 1.2778222222222224e-05, + "loss": 0.3992, + "step": 16250 + }, + { + "epoch": 0.36126910771418413, + "grad_norm": 2.328125, + "learning_rate": 1.2773777777777778e-05, + "loss": 0.3939, + "step": 16260 + }, + { + "epoch": 0.3614912904372556, + "grad_norm": 2.390625, + "learning_rate": 1.2769333333333333e-05, + "loss": 0.3851, + "step": 16270 + }, + { + "epoch": 0.36171347316032704, + "grad_norm": 1.984375, + "learning_rate": 1.2764888888888888e-05, + "loss": 0.4086, + "step": 16280 + }, + { + "epoch": 0.3619356558833985, + "grad_norm": 1.984375, + "learning_rate": 1.2760444444444446e-05, + "loss": 0.4101, + "step": 16290 + }, + { + "epoch": 0.36215783860646994, + "grad_norm": 1.953125, + "learning_rate": 1.2756000000000001e-05, + "loss": 0.4031, + "step": 16300 + }, + { + "epoch": 0.3623800213295414, + "grad_norm": 2.390625, + "learning_rate": 1.2751555555555558e-05, + "loss": 0.3995, + "step": 16310 + }, + { + "epoch": 0.36260220405261284, + "grad_norm": 1.8203125, + "learning_rate": 1.2747111111111113e-05, + "loss": 0.3927, + "step": 16320 + }, + { + "epoch": 0.36282438677568435, + "grad_norm": 2.25, + "learning_rate": 1.2742666666666667e-05, + "loss": 0.4459, + "step": 16330 + }, + { + "epoch": 0.3630465694987558, + "grad_norm": 1.84375, + "learning_rate": 1.2738222222222222e-05, + "loss": 0.3973, + "step": 16340 + }, + { + "epoch": 0.36326875222182725, + "grad_norm": 1.8203125, + "learning_rate": 1.2733777777777779e-05, + "loss": 0.3703, + "step": 16350 + }, + { + "epoch": 0.3634909349448987, + "grad_norm": 1.9140625, + "learning_rate": 1.2729333333333334e-05, + "loss": 0.4159, + "step": 16360 + }, + { + "epoch": 0.36371311766797015, + "grad_norm": 1.90625, + "learning_rate": 1.2724888888888889e-05, + "loss": 0.4324, + "step": 16370 + }, + { + "epoch": 0.3639353003910416, + "grad_norm": 2.21875, + "learning_rate": 1.2720444444444447e-05, + "loss": 0.4476, + "step": 16380 + }, + { + "epoch": 0.36415748311411306, + "grad_norm": 2.078125, + "learning_rate": 1.2716000000000002e-05, + "loss": 0.4055, + "step": 16390 + }, + { + "epoch": 0.3643796658371845, + "grad_norm": 2.46875, + "learning_rate": 1.2711555555555556e-05, + "loss": 0.3853, + "step": 16400 + }, + { + "epoch": 0.36460184856025596, + "grad_norm": 2.375, + "learning_rate": 1.2707111111111113e-05, + "loss": 0.4151, + "step": 16410 + }, + { + "epoch": 0.3648240312833274, + "grad_norm": 2.203125, + "learning_rate": 1.2702666666666668e-05, + "loss": 0.3948, + "step": 16420 + }, + { + "epoch": 0.36504621400639886, + "grad_norm": 2.015625, + "learning_rate": 1.2698222222222223e-05, + "loss": 0.3796, + "step": 16430 + }, + { + "epoch": 0.3652683967294703, + "grad_norm": 2.359375, + "learning_rate": 1.2693777777777778e-05, + "loss": 0.428, + "step": 16440 + }, + { + "epoch": 0.36549057945254176, + "grad_norm": 1.8125, + "learning_rate": 1.2689333333333334e-05, + "loss": 0.4008, + "step": 16450 + }, + { + "epoch": 0.3657127621756132, + "grad_norm": 2.171875, + "learning_rate": 1.268488888888889e-05, + "loss": 0.4065, + "step": 16460 + }, + { + "epoch": 0.36593494489868467, + "grad_norm": 1.78125, + "learning_rate": 1.2680444444444447e-05, + "loss": 0.3712, + "step": 16470 + }, + { + "epoch": 0.3661571276217561, + "grad_norm": 2.125, + "learning_rate": 1.2676000000000002e-05, + "loss": 0.3857, + "step": 16480 + }, + { + "epoch": 0.36637931034482757, + "grad_norm": 2.328125, + "learning_rate": 1.2671555555555557e-05, + "loss": 0.3649, + "step": 16490 + }, + { + "epoch": 0.366601493067899, + "grad_norm": 2.21875, + "learning_rate": 1.2667111111111112e-05, + "loss": 0.4072, + "step": 16500 + }, + { + "epoch": 0.36682367579097047, + "grad_norm": 2.34375, + "learning_rate": 1.2662666666666668e-05, + "loss": 0.3622, + "step": 16510 + }, + { + "epoch": 0.3670458585140419, + "grad_norm": 2.171875, + "learning_rate": 1.2658222222222223e-05, + "loss": 0.3704, + "step": 16520 + }, + { + "epoch": 0.36726804123711343, + "grad_norm": 2.421875, + "learning_rate": 1.2653777777777778e-05, + "loss": 0.3951, + "step": 16530 + }, + { + "epoch": 0.3674902239601849, + "grad_norm": 2.5, + "learning_rate": 1.2649333333333333e-05, + "loss": 0.3834, + "step": 16540 + }, + { + "epoch": 0.36771240668325633, + "grad_norm": 2.59375, + "learning_rate": 1.2644888888888891e-05, + "loss": 0.3831, + "step": 16550 + }, + { + "epoch": 0.3679345894063278, + "grad_norm": 2.125, + "learning_rate": 1.2640444444444446e-05, + "loss": 0.4009, + "step": 16560 + }, + { + "epoch": 0.36815677212939923, + "grad_norm": 2.1875, + "learning_rate": 1.2636000000000002e-05, + "loss": 0.3755, + "step": 16570 + }, + { + "epoch": 0.3683789548524707, + "grad_norm": 2.328125, + "learning_rate": 1.2631555555555557e-05, + "loss": 0.3906, + "step": 16580 + }, + { + "epoch": 0.36860113757554214, + "grad_norm": 2.265625, + "learning_rate": 1.2627111111111112e-05, + "loss": 0.386, + "step": 16590 + }, + { + "epoch": 0.3688233202986136, + "grad_norm": 2.25, + "learning_rate": 1.2622666666666667e-05, + "loss": 0.3982, + "step": 16600 + }, + { + "epoch": 0.36904550302168504, + "grad_norm": 2.203125, + "learning_rate": 1.2618222222222223e-05, + "loss": 0.3969, + "step": 16610 + }, + { + "epoch": 0.3692676857447565, + "grad_norm": 2.375, + "learning_rate": 1.2613777777777778e-05, + "loss": 0.3981, + "step": 16620 + }, + { + "epoch": 0.36948986846782794, + "grad_norm": 2.125, + "learning_rate": 1.2609333333333333e-05, + "loss": 0.3991, + "step": 16630 + }, + { + "epoch": 0.3697120511908994, + "grad_norm": 2.234375, + "learning_rate": 1.2604888888888891e-05, + "loss": 0.3996, + "step": 16640 + }, + { + "epoch": 0.36993423391397084, + "grad_norm": 1.8125, + "learning_rate": 1.2600444444444446e-05, + "loss": 0.3984, + "step": 16650 + }, + { + "epoch": 0.3701564166370423, + "grad_norm": 2.171875, + "learning_rate": 1.2596000000000001e-05, + "loss": 0.3777, + "step": 16660 + }, + { + "epoch": 0.37037859936011375, + "grad_norm": 2.09375, + "learning_rate": 1.2591555555555558e-05, + "loss": 0.4213, + "step": 16670 + }, + { + "epoch": 0.3706007820831852, + "grad_norm": 1.71875, + "learning_rate": 1.2587111111111112e-05, + "loss": 0.3577, + "step": 16680 + }, + { + "epoch": 0.37082296480625665, + "grad_norm": 1.9609375, + "learning_rate": 1.2582666666666667e-05, + "loss": 0.4019, + "step": 16690 + }, + { + "epoch": 0.3710451475293281, + "grad_norm": 2.046875, + "learning_rate": 1.2578222222222222e-05, + "loss": 0.3884, + "step": 16700 + }, + { + "epoch": 0.37126733025239955, + "grad_norm": 2.40625, + "learning_rate": 1.2573777777777779e-05, + "loss": 0.3848, + "step": 16710 + }, + { + "epoch": 0.371489512975471, + "grad_norm": 1.8515625, + "learning_rate": 1.2569333333333333e-05, + "loss": 0.3885, + "step": 16720 + }, + { + "epoch": 0.37171169569854245, + "grad_norm": 2.421875, + "learning_rate": 1.2564888888888892e-05, + "loss": 0.404, + "step": 16730 + }, + { + "epoch": 0.37193387842161396, + "grad_norm": 2.015625, + "learning_rate": 1.2560444444444447e-05, + "loss": 0.4297, + "step": 16740 + }, + { + "epoch": 0.3721560611446854, + "grad_norm": 2.078125, + "learning_rate": 1.2556000000000001e-05, + "loss": 0.39, + "step": 16750 + }, + { + "epoch": 0.37237824386775686, + "grad_norm": 2.03125, + "learning_rate": 1.2551555555555556e-05, + "loss": 0.3653, + "step": 16760 + }, + { + "epoch": 0.3726004265908283, + "grad_norm": 2.375, + "learning_rate": 1.2547111111111113e-05, + "loss": 0.4062, + "step": 16770 + }, + { + "epoch": 0.37282260931389977, + "grad_norm": 2.140625, + "learning_rate": 1.2542666666666668e-05, + "loss": 0.3857, + "step": 16780 + }, + { + "epoch": 0.3730447920369712, + "grad_norm": 2.1875, + "learning_rate": 1.2538222222222222e-05, + "loss": 0.3776, + "step": 16790 + }, + { + "epoch": 0.37326697476004267, + "grad_norm": 2.15625, + "learning_rate": 1.2533777777777777e-05, + "loss": 0.391, + "step": 16800 + }, + { + "epoch": 0.3734891574831141, + "grad_norm": 2.28125, + "learning_rate": 1.2529333333333334e-05, + "loss": 0.4218, + "step": 16810 + }, + { + "epoch": 0.37371134020618557, + "grad_norm": 2.75, + "learning_rate": 1.252488888888889e-05, + "loss": 0.398, + "step": 16820 + }, + { + "epoch": 0.373933522929257, + "grad_norm": 2.109375, + "learning_rate": 1.2520444444444447e-05, + "loss": 0.3775, + "step": 16830 + }, + { + "epoch": 0.3741557056523285, + "grad_norm": 2.21875, + "learning_rate": 1.2516000000000002e-05, + "loss": 0.3884, + "step": 16840 + }, + { + "epoch": 0.3743778883753999, + "grad_norm": 2.875, + "learning_rate": 1.2511555555555557e-05, + "loss": 0.4421, + "step": 16850 + }, + { + "epoch": 0.3746000710984714, + "grad_norm": 1.9453125, + "learning_rate": 1.2507111111111111e-05, + "loss": 0.3946, + "step": 16860 + }, + { + "epoch": 0.37482225382154283, + "grad_norm": 2.390625, + "learning_rate": 1.2502666666666668e-05, + "loss": 0.4128, + "step": 16870 + }, + { + "epoch": 0.3750444365446143, + "grad_norm": 2.171875, + "learning_rate": 1.2498222222222223e-05, + "loss": 0.4037, + "step": 16880 + }, + { + "epoch": 0.37526661926768573, + "grad_norm": 2.53125, + "learning_rate": 1.2493777777777778e-05, + "loss": 0.3919, + "step": 16890 + }, + { + "epoch": 0.3754888019907572, + "grad_norm": 2.28125, + "learning_rate": 1.2489333333333333e-05, + "loss": 0.3728, + "step": 16900 + }, + { + "epoch": 0.37571098471382863, + "grad_norm": 2.28125, + "learning_rate": 1.248488888888889e-05, + "loss": 0.3716, + "step": 16910 + }, + { + "epoch": 0.3759331674369001, + "grad_norm": 2.1875, + "learning_rate": 1.2480444444444446e-05, + "loss": 0.4272, + "step": 16920 + }, + { + "epoch": 0.37615535015997154, + "grad_norm": 1.9765625, + "learning_rate": 1.2476000000000002e-05, + "loss": 0.3697, + "step": 16930 + }, + { + "epoch": 0.37637753288304304, + "grad_norm": 2.515625, + "learning_rate": 1.2471555555555557e-05, + "loss": 0.3907, + "step": 16940 + }, + { + "epoch": 0.3765997156061145, + "grad_norm": 2.390625, + "learning_rate": 1.2467111111111112e-05, + "loss": 0.4133, + "step": 16950 + }, + { + "epoch": 0.37682189832918594, + "grad_norm": 2.09375, + "learning_rate": 1.2462666666666667e-05, + "loss": 0.4295, + "step": 16960 + }, + { + "epoch": 0.3770440810522574, + "grad_norm": 2.34375, + "learning_rate": 1.2458222222222223e-05, + "loss": 0.3938, + "step": 16970 + }, + { + "epoch": 0.37726626377532885, + "grad_norm": 2.25, + "learning_rate": 1.2453777777777778e-05, + "loss": 0.4059, + "step": 16980 + }, + { + "epoch": 0.3774884464984003, + "grad_norm": 2.15625, + "learning_rate": 1.2449333333333333e-05, + "loss": 0.366, + "step": 16990 + }, + { + "epoch": 0.37771062922147175, + "grad_norm": 2.546875, + "learning_rate": 1.2444888888888891e-05, + "loss": 0.4216, + "step": 17000 + }, + { + "epoch": 0.3779328119445432, + "grad_norm": 2.484375, + "learning_rate": 1.2440444444444446e-05, + "loss": 0.3814, + "step": 17010 + }, + { + "epoch": 0.37815499466761465, + "grad_norm": 2.03125, + "learning_rate": 1.2436000000000001e-05, + "loss": 0.3967, + "step": 17020 + }, + { + "epoch": 0.3783771773906861, + "grad_norm": 2.15625, + "learning_rate": 1.2431555555555557e-05, + "loss": 0.399, + "step": 17030 + }, + { + "epoch": 0.37859936011375755, + "grad_norm": 2.015625, + "learning_rate": 1.2427111111111112e-05, + "loss": 0.3599, + "step": 17040 + }, + { + "epoch": 0.378821542836829, + "grad_norm": 2.734375, + "learning_rate": 1.2422666666666667e-05, + "loss": 0.4512, + "step": 17050 + }, + { + "epoch": 0.37904372555990046, + "grad_norm": 1.9609375, + "learning_rate": 1.2418222222222222e-05, + "loss": 0.4067, + "step": 17060 + }, + { + "epoch": 0.3792659082829719, + "grad_norm": 2.21875, + "learning_rate": 1.2413777777777778e-05, + "loss": 0.4283, + "step": 17070 + }, + { + "epoch": 0.37948809100604336, + "grad_norm": 2.65625, + "learning_rate": 1.2409333333333335e-05, + "loss": 0.4055, + "step": 17080 + }, + { + "epoch": 0.3797102737291148, + "grad_norm": 2.203125, + "learning_rate": 1.2404888888888892e-05, + "loss": 0.3983, + "step": 17090 + }, + { + "epoch": 0.37993245645218626, + "grad_norm": 2.109375, + "learning_rate": 1.2400444444444446e-05, + "loss": 0.4344, + "step": 17100 + }, + { + "epoch": 0.3801546391752577, + "grad_norm": 2.203125, + "learning_rate": 1.2396000000000001e-05, + "loss": 0.4133, + "step": 17110 + }, + { + "epoch": 0.38037682189832916, + "grad_norm": 2.15625, + "learning_rate": 1.2391555555555556e-05, + "loss": 0.4089, + "step": 17120 + }, + { + "epoch": 0.3805990046214006, + "grad_norm": 2.046875, + "learning_rate": 1.2387111111111113e-05, + "loss": 0.3972, + "step": 17130 + }, + { + "epoch": 0.38082118734447207, + "grad_norm": 2.375, + "learning_rate": 1.2382666666666667e-05, + "loss": 0.3929, + "step": 17140 + }, + { + "epoch": 0.3810433700675436, + "grad_norm": 1.8515625, + "learning_rate": 1.2378222222222222e-05, + "loss": 0.4264, + "step": 17150 + }, + { + "epoch": 0.381265552790615, + "grad_norm": 2.28125, + "learning_rate": 1.2373777777777777e-05, + "loss": 0.3825, + "step": 17160 + }, + { + "epoch": 0.3814877355136865, + "grad_norm": 2.359375, + "learning_rate": 1.2369333333333335e-05, + "loss": 0.3898, + "step": 17170 + }, + { + "epoch": 0.38170991823675793, + "grad_norm": 2.109375, + "learning_rate": 1.236488888888889e-05, + "loss": 0.3757, + "step": 17180 + }, + { + "epoch": 0.3819321009598294, + "grad_norm": 2.1875, + "learning_rate": 1.2360444444444447e-05, + "loss": 0.4031, + "step": 17190 + }, + { + "epoch": 0.38215428368290083, + "grad_norm": 2.8125, + "learning_rate": 1.2356000000000002e-05, + "loss": 0.4101, + "step": 17200 + }, + { + "epoch": 0.3823764664059723, + "grad_norm": 2.078125, + "learning_rate": 1.2351555555555556e-05, + "loss": 0.4174, + "step": 17210 + }, + { + "epoch": 0.38259864912904373, + "grad_norm": 2.03125, + "learning_rate": 1.2347111111111111e-05, + "loss": 0.3623, + "step": 17220 + }, + { + "epoch": 0.3828208318521152, + "grad_norm": 1.8359375, + "learning_rate": 1.2342666666666668e-05, + "loss": 0.3731, + "step": 17230 + }, + { + "epoch": 0.38304301457518664, + "grad_norm": 2.375, + "learning_rate": 1.2338222222222223e-05, + "loss": 0.4104, + "step": 17240 + }, + { + "epoch": 0.3832651972982581, + "grad_norm": 2.390625, + "learning_rate": 1.2333777777777778e-05, + "loss": 0.4195, + "step": 17250 + }, + { + "epoch": 0.38348738002132954, + "grad_norm": 1.953125, + "learning_rate": 1.2329333333333336e-05, + "loss": 0.3738, + "step": 17260 + }, + { + "epoch": 0.383709562744401, + "grad_norm": 1.703125, + "learning_rate": 1.232488888888889e-05, + "loss": 0.3437, + "step": 17270 + }, + { + "epoch": 0.38393174546747244, + "grad_norm": 1.84375, + "learning_rate": 1.2320444444444445e-05, + "loss": 0.3965, + "step": 17280 + }, + { + "epoch": 0.3841539281905439, + "grad_norm": 2.203125, + "learning_rate": 1.2316000000000002e-05, + "loss": 0.4006, + "step": 17290 + }, + { + "epoch": 0.38437611091361534, + "grad_norm": 2.03125, + "learning_rate": 1.2311555555555557e-05, + "loss": 0.3767, + "step": 17300 + }, + { + "epoch": 0.3845982936366868, + "grad_norm": 3.0625, + "learning_rate": 1.2307111111111112e-05, + "loss": 0.3993, + "step": 17310 + }, + { + "epoch": 0.38482047635975825, + "grad_norm": 2.484375, + "learning_rate": 1.2302666666666667e-05, + "loss": 0.3828, + "step": 17320 + }, + { + "epoch": 0.3850426590828297, + "grad_norm": 2.609375, + "learning_rate": 1.2298222222222223e-05, + "loss": 0.3965, + "step": 17330 + }, + { + "epoch": 0.38526484180590115, + "grad_norm": 2.296875, + "learning_rate": 1.2293777777777778e-05, + "loss": 0.3843, + "step": 17340 + }, + { + "epoch": 0.3854870245289726, + "grad_norm": 2.0, + "learning_rate": 1.2289333333333336e-05, + "loss": 0.4183, + "step": 17350 + }, + { + "epoch": 0.3857092072520441, + "grad_norm": 2.140625, + "learning_rate": 1.2284888888888891e-05, + "loss": 0.399, + "step": 17360 + }, + { + "epoch": 0.38593138997511556, + "grad_norm": 1.96875, + "learning_rate": 1.2280444444444446e-05, + "loss": 0.3745, + "step": 17370 + }, + { + "epoch": 0.386153572698187, + "grad_norm": 2.265625, + "learning_rate": 1.2276e-05, + "loss": 0.3713, + "step": 17380 + }, + { + "epoch": 0.38637575542125846, + "grad_norm": 2.359375, + "learning_rate": 1.2271555555555557e-05, + "loss": 0.4097, + "step": 17390 + }, + { + "epoch": 0.3865979381443299, + "grad_norm": 2.28125, + "learning_rate": 1.2267111111111112e-05, + "loss": 0.4068, + "step": 17400 + }, + { + "epoch": 0.38682012086740136, + "grad_norm": 2.28125, + "learning_rate": 1.2262666666666667e-05, + "loss": 0.4166, + "step": 17410 + }, + { + "epoch": 0.3870423035904728, + "grad_norm": 2.046875, + "learning_rate": 1.2258222222222222e-05, + "loss": 0.4135, + "step": 17420 + }, + { + "epoch": 0.38726448631354426, + "grad_norm": 2.046875, + "learning_rate": 1.2253777777777778e-05, + "loss": 0.3857, + "step": 17430 + }, + { + "epoch": 0.3874866690366157, + "grad_norm": 1.734375, + "learning_rate": 1.2249333333333335e-05, + "loss": 0.3891, + "step": 17440 + }, + { + "epoch": 0.38770885175968717, + "grad_norm": 2.09375, + "learning_rate": 1.2244888888888891e-05, + "loss": 0.3675, + "step": 17450 + }, + { + "epoch": 0.3879310344827586, + "grad_norm": 2.484375, + "learning_rate": 1.2240444444444446e-05, + "loss": 0.3991, + "step": 17460 + }, + { + "epoch": 0.38815321720583007, + "grad_norm": 2.59375, + "learning_rate": 1.2236000000000001e-05, + "loss": 0.3881, + "step": 17470 + }, + { + "epoch": 0.3883753999289015, + "grad_norm": 2.25, + "learning_rate": 1.2231555555555556e-05, + "loss": 0.4251, + "step": 17480 + }, + { + "epoch": 0.388597582651973, + "grad_norm": 2.234375, + "learning_rate": 1.2227111111111112e-05, + "loss": 0.4082, + "step": 17490 + }, + { + "epoch": 0.3888197653750444, + "grad_norm": 2.15625, + "learning_rate": 1.2222666666666667e-05, + "loss": 0.38, + "step": 17500 + }, + { + "epoch": 0.3890419480981159, + "grad_norm": 2.1875, + "learning_rate": 1.2218222222222222e-05, + "loss": 0.4046, + "step": 17510 + }, + { + "epoch": 0.3892641308211873, + "grad_norm": 2.109375, + "learning_rate": 1.2213777777777777e-05, + "loss": 0.3826, + "step": 17520 + }, + { + "epoch": 0.3894863135442588, + "grad_norm": 2.484375, + "learning_rate": 1.2209333333333335e-05, + "loss": 0.3907, + "step": 17530 + }, + { + "epoch": 0.38970849626733023, + "grad_norm": 2.234375, + "learning_rate": 1.220488888888889e-05, + "loss": 0.371, + "step": 17540 + }, + { + "epoch": 0.3899306789904017, + "grad_norm": 2.296875, + "learning_rate": 1.2200444444444447e-05, + "loss": 0.3836, + "step": 17550 + }, + { + "epoch": 0.3901528617134732, + "grad_norm": 2.015625, + "learning_rate": 1.2196000000000001e-05, + "loss": 0.4352, + "step": 17560 + }, + { + "epoch": 0.39037504443654464, + "grad_norm": 1.953125, + "learning_rate": 1.2191555555555556e-05, + "loss": 0.3747, + "step": 17570 + }, + { + "epoch": 0.3905972271596161, + "grad_norm": 2.53125, + "learning_rate": 1.2187111111111111e-05, + "loss": 0.3985, + "step": 17580 + }, + { + "epoch": 0.39081940988268754, + "grad_norm": 2.3125, + "learning_rate": 1.2182666666666668e-05, + "loss": 0.4214, + "step": 17590 + }, + { + "epoch": 0.391041592605759, + "grad_norm": 2.046875, + "learning_rate": 1.2178222222222223e-05, + "loss": 0.3523, + "step": 17600 + }, + { + "epoch": 0.39126377532883044, + "grad_norm": 2.296875, + "learning_rate": 1.2173777777777777e-05, + "loss": 0.389, + "step": 17610 + }, + { + "epoch": 0.3914859580519019, + "grad_norm": 2.6875, + "learning_rate": 1.2169333333333336e-05, + "loss": 0.4053, + "step": 17620 + }, + { + "epoch": 0.39170814077497335, + "grad_norm": 2.25, + "learning_rate": 1.216488888888889e-05, + "loss": 0.3832, + "step": 17630 + }, + { + "epoch": 0.3919303234980448, + "grad_norm": 2.046875, + "learning_rate": 1.2160444444444445e-05, + "loss": 0.3999, + "step": 17640 + }, + { + "epoch": 0.39215250622111625, + "grad_norm": 2.234375, + "learning_rate": 1.2156000000000002e-05, + "loss": 0.4054, + "step": 17650 + }, + { + "epoch": 0.3923746889441877, + "grad_norm": 2.3125, + "learning_rate": 1.2151555555555557e-05, + "loss": 0.4095, + "step": 17660 + }, + { + "epoch": 0.39259687166725915, + "grad_norm": 2.171875, + "learning_rate": 1.2147111111111112e-05, + "loss": 0.4301, + "step": 17670 + }, + { + "epoch": 0.3928190543903306, + "grad_norm": 1.7421875, + "learning_rate": 1.2142666666666666e-05, + "loss": 0.3823, + "step": 17680 + }, + { + "epoch": 0.39304123711340205, + "grad_norm": 2.640625, + "learning_rate": 1.2138222222222223e-05, + "loss": 0.4237, + "step": 17690 + }, + { + "epoch": 0.3932634198364735, + "grad_norm": 2.25, + "learning_rate": 1.213377777777778e-05, + "loss": 0.4074, + "step": 17700 + }, + { + "epoch": 0.39348560255954496, + "grad_norm": 2.234375, + "learning_rate": 1.2129333333333336e-05, + "loss": 0.4239, + "step": 17710 + }, + { + "epoch": 0.3937077852826164, + "grad_norm": 2.296875, + "learning_rate": 1.2124888888888891e-05, + "loss": 0.3956, + "step": 17720 + }, + { + "epoch": 0.39392996800568786, + "grad_norm": 2.0, + "learning_rate": 1.2120444444444446e-05, + "loss": 0.3783, + "step": 17730 + }, + { + "epoch": 0.3941521507287593, + "grad_norm": 1.8984375, + "learning_rate": 1.2116e-05, + "loss": 0.3882, + "step": 17740 + }, + { + "epoch": 0.39437433345183076, + "grad_norm": 2.203125, + "learning_rate": 1.2111555555555557e-05, + "loss": 0.3935, + "step": 17750 + }, + { + "epoch": 0.3945965161749022, + "grad_norm": 2.296875, + "learning_rate": 1.2107111111111112e-05, + "loss": 0.4248, + "step": 17760 + }, + { + "epoch": 0.3948186988979737, + "grad_norm": 2.828125, + "learning_rate": 1.2102666666666667e-05, + "loss": 0.4196, + "step": 17770 + }, + { + "epoch": 0.39504088162104517, + "grad_norm": 2.109375, + "learning_rate": 1.2098222222222222e-05, + "loss": 0.4248, + "step": 17780 + }, + { + "epoch": 0.3952630643441166, + "grad_norm": 2.234375, + "learning_rate": 1.209377777777778e-05, + "loss": 0.3692, + "step": 17790 + }, + { + "epoch": 0.3954852470671881, + "grad_norm": 2.09375, + "learning_rate": 1.2089333333333335e-05, + "loss": 0.3917, + "step": 17800 + }, + { + "epoch": 0.3957074297902595, + "grad_norm": 2.046875, + "learning_rate": 1.2084888888888891e-05, + "loss": 0.3894, + "step": 17810 + }, + { + "epoch": 0.395929612513331, + "grad_norm": 2.46875, + "learning_rate": 1.2080444444444446e-05, + "loss": 0.3956, + "step": 17820 + }, + { + "epoch": 0.3961517952364024, + "grad_norm": 2.15625, + "learning_rate": 1.2076000000000001e-05, + "loss": 0.3929, + "step": 17830 + }, + { + "epoch": 0.3963739779594739, + "grad_norm": 2.171875, + "learning_rate": 1.2071555555555556e-05, + "loss": 0.406, + "step": 17840 + }, + { + "epoch": 0.39659616068254533, + "grad_norm": 1.9609375, + "learning_rate": 1.2067111111111112e-05, + "loss": 0.3919, + "step": 17850 + }, + { + "epoch": 0.3968183434056168, + "grad_norm": 2.109375, + "learning_rate": 1.2062666666666667e-05, + "loss": 0.417, + "step": 17860 + }, + { + "epoch": 0.39704052612868823, + "grad_norm": 1.9921875, + "learning_rate": 1.2058222222222222e-05, + "loss": 0.3916, + "step": 17870 + }, + { + "epoch": 0.3972627088517597, + "grad_norm": 2.1875, + "learning_rate": 1.205377777777778e-05, + "loss": 0.4051, + "step": 17880 + }, + { + "epoch": 0.39748489157483113, + "grad_norm": 2.078125, + "learning_rate": 1.2049333333333335e-05, + "loss": 0.3559, + "step": 17890 + }, + { + "epoch": 0.3977070742979026, + "grad_norm": 2.828125, + "learning_rate": 1.204488888888889e-05, + "loss": 0.4128, + "step": 17900 + }, + { + "epoch": 0.39792925702097404, + "grad_norm": 2.109375, + "learning_rate": 1.2040444444444446e-05, + "loss": 0.3935, + "step": 17910 + }, + { + "epoch": 0.3981514397440455, + "grad_norm": 2.34375, + "learning_rate": 1.2036000000000001e-05, + "loss": 0.396, + "step": 17920 + }, + { + "epoch": 0.39837362246711694, + "grad_norm": 2.28125, + "learning_rate": 1.2031555555555556e-05, + "loss": 0.388, + "step": 17930 + }, + { + "epoch": 0.3985958051901884, + "grad_norm": 2.28125, + "learning_rate": 1.2027111111111111e-05, + "loss": 0.4001, + "step": 17940 + }, + { + "epoch": 0.39881798791325984, + "grad_norm": 2.015625, + "learning_rate": 1.2022666666666668e-05, + "loss": 0.3741, + "step": 17950 + }, + { + "epoch": 0.3990401706363313, + "grad_norm": 2.5, + "learning_rate": 1.2018222222222222e-05, + "loss": 0.4268, + "step": 17960 + }, + { + "epoch": 0.3992623533594028, + "grad_norm": 2.1875, + "learning_rate": 1.2013777777777779e-05, + "loss": 0.4013, + "step": 17970 + }, + { + "epoch": 0.39948453608247425, + "grad_norm": 2.328125, + "learning_rate": 1.2009333333333335e-05, + "loss": 0.3873, + "step": 17980 + }, + { + "epoch": 0.3997067188055457, + "grad_norm": 2.046875, + "learning_rate": 1.200488888888889e-05, + "loss": 0.3654, + "step": 17990 + }, + { + "epoch": 0.39992890152861715, + "grad_norm": 2.71875, + "learning_rate": 1.2000444444444445e-05, + "loss": 0.3867, + "step": 18000 + }, + { + "epoch": 0.4001510842516886, + "grad_norm": 2.109375, + "learning_rate": 1.1996000000000002e-05, + "loss": 0.415, + "step": 18010 + }, + { + "epoch": 0.40037326697476006, + "grad_norm": 2.03125, + "learning_rate": 1.1991555555555557e-05, + "loss": 0.3893, + "step": 18020 + }, + { + "epoch": 0.4005954496978315, + "grad_norm": 2.296875, + "learning_rate": 1.1987111111111111e-05, + "loss": 0.398, + "step": 18030 + }, + { + "epoch": 0.40081763242090296, + "grad_norm": 2.421875, + "learning_rate": 1.1982666666666666e-05, + "loss": 0.3704, + "step": 18040 + }, + { + "epoch": 0.4010398151439744, + "grad_norm": 2.3125, + "learning_rate": 1.1978222222222223e-05, + "loss": 0.3863, + "step": 18050 + }, + { + "epoch": 0.40126199786704586, + "grad_norm": 2.6875, + "learning_rate": 1.197377777777778e-05, + "loss": 0.3964, + "step": 18060 + }, + { + "epoch": 0.4014841805901173, + "grad_norm": 2.09375, + "learning_rate": 1.1969333333333336e-05, + "loss": 0.3743, + "step": 18070 + }, + { + "epoch": 0.40170636331318876, + "grad_norm": 2.53125, + "learning_rate": 1.196488888888889e-05, + "loss": 0.3623, + "step": 18080 + }, + { + "epoch": 0.4019285460362602, + "grad_norm": 1.96875, + "learning_rate": 1.1960444444444446e-05, + "loss": 0.358, + "step": 18090 + }, + { + "epoch": 0.40215072875933167, + "grad_norm": 1.859375, + "learning_rate": 1.1956e-05, + "loss": 0.4099, + "step": 18100 + }, + { + "epoch": 0.4023729114824031, + "grad_norm": 2.203125, + "learning_rate": 1.1951555555555557e-05, + "loss": 0.3787, + "step": 18110 + }, + { + "epoch": 0.40259509420547457, + "grad_norm": 2.046875, + "learning_rate": 1.1947111111111112e-05, + "loss": 0.3866, + "step": 18120 + }, + { + "epoch": 0.402817276928546, + "grad_norm": 2.140625, + "learning_rate": 1.1942666666666667e-05, + "loss": 0.3891, + "step": 18130 + }, + { + "epoch": 0.40303945965161747, + "grad_norm": 2.03125, + "learning_rate": 1.1938222222222221e-05, + "loss": 0.3637, + "step": 18140 + }, + { + "epoch": 0.4032616423746889, + "grad_norm": 2.203125, + "learning_rate": 1.193377777777778e-05, + "loss": 0.3642, + "step": 18150 + }, + { + "epoch": 0.4034838250977604, + "grad_norm": 2.09375, + "learning_rate": 1.1929333333333335e-05, + "loss": 0.3591, + "step": 18160 + }, + { + "epoch": 0.4037060078208318, + "grad_norm": 2.5, + "learning_rate": 1.1924888888888891e-05, + "loss": 0.3992, + "step": 18170 + }, + { + "epoch": 0.40392819054390333, + "grad_norm": 2.265625, + "learning_rate": 1.1920444444444446e-05, + "loss": 0.3903, + "step": 18180 + }, + { + "epoch": 0.4041503732669748, + "grad_norm": 2.140625, + "learning_rate": 1.1916e-05, + "loss": 0.4211, + "step": 18190 + }, + { + "epoch": 0.40437255599004623, + "grad_norm": 2.390625, + "learning_rate": 1.1911555555555556e-05, + "loss": 0.4168, + "step": 18200 + }, + { + "epoch": 0.4045947387131177, + "grad_norm": 2.359375, + "learning_rate": 1.1907111111111112e-05, + "loss": 0.4052, + "step": 18210 + }, + { + "epoch": 0.40481692143618914, + "grad_norm": 2.375, + "learning_rate": 1.1902666666666667e-05, + "loss": 0.404, + "step": 18220 + }, + { + "epoch": 0.4050391041592606, + "grad_norm": 2.71875, + "learning_rate": 1.1898222222222222e-05, + "loss": 0.4127, + "step": 18230 + }, + { + "epoch": 0.40526128688233204, + "grad_norm": 2.09375, + "learning_rate": 1.189377777777778e-05, + "loss": 0.3987, + "step": 18240 + }, + { + "epoch": 0.4054834696054035, + "grad_norm": 1.8203125, + "learning_rate": 1.1889333333333335e-05, + "loss": 0.3983, + "step": 18250 + }, + { + "epoch": 0.40570565232847494, + "grad_norm": 2.296875, + "learning_rate": 1.188488888888889e-05, + "loss": 0.3852, + "step": 18260 + }, + { + "epoch": 0.4059278350515464, + "grad_norm": 2.625, + "learning_rate": 1.1880444444444446e-05, + "loss": 0.382, + "step": 18270 + }, + { + "epoch": 0.40615001777461784, + "grad_norm": 2.34375, + "learning_rate": 1.1876000000000001e-05, + "loss": 0.3908, + "step": 18280 + }, + { + "epoch": 0.4063722004976893, + "grad_norm": 2.296875, + "learning_rate": 1.1871555555555556e-05, + "loss": 0.4179, + "step": 18290 + }, + { + "epoch": 0.40659438322076075, + "grad_norm": 2.484375, + "learning_rate": 1.186711111111111e-05, + "loss": 0.4292, + "step": 18300 + }, + { + "epoch": 0.4068165659438322, + "grad_norm": 2.09375, + "learning_rate": 1.1862666666666667e-05, + "loss": 0.4112, + "step": 18310 + }, + { + "epoch": 0.40703874866690365, + "grad_norm": 2.28125, + "learning_rate": 1.1858222222222224e-05, + "loss": 0.4137, + "step": 18320 + }, + { + "epoch": 0.4072609313899751, + "grad_norm": 2.359375, + "learning_rate": 1.1853777777777779e-05, + "loss": 0.4142, + "step": 18330 + }, + { + "epoch": 0.40748311411304655, + "grad_norm": 2.4375, + "learning_rate": 1.1849333333333335e-05, + "loss": 0.3724, + "step": 18340 + }, + { + "epoch": 0.407705296836118, + "grad_norm": 2.125, + "learning_rate": 1.184488888888889e-05, + "loss": 0.3825, + "step": 18350 + }, + { + "epoch": 0.40792747955918945, + "grad_norm": 2.5625, + "learning_rate": 1.1840444444444445e-05, + "loss": 0.4147, + "step": 18360 + }, + { + "epoch": 0.4081496622822609, + "grad_norm": 2.171875, + "learning_rate": 1.1836000000000002e-05, + "loss": 0.38, + "step": 18370 + }, + { + "epoch": 0.4083718450053324, + "grad_norm": 2.3125, + "learning_rate": 1.1831555555555556e-05, + "loss": 0.3932, + "step": 18380 + }, + { + "epoch": 0.40859402772840386, + "grad_norm": 2.171875, + "learning_rate": 1.1827111111111111e-05, + "loss": 0.4194, + "step": 18390 + }, + { + "epoch": 0.4088162104514753, + "grad_norm": 2.015625, + "learning_rate": 1.1822666666666666e-05, + "loss": 0.4206, + "step": 18400 + }, + { + "epoch": 0.40903839317454677, + "grad_norm": 2.234375, + "learning_rate": 1.1818222222222224e-05, + "loss": 0.3969, + "step": 18410 + }, + { + "epoch": 0.4092605758976182, + "grad_norm": 2.609375, + "learning_rate": 1.1813777777777779e-05, + "loss": 0.3743, + "step": 18420 + }, + { + "epoch": 0.40948275862068967, + "grad_norm": 2.15625, + "learning_rate": 1.1809333333333336e-05, + "loss": 0.4294, + "step": 18430 + }, + { + "epoch": 0.4097049413437611, + "grad_norm": 2.171875, + "learning_rate": 1.180488888888889e-05, + "loss": 0.3588, + "step": 18440 + }, + { + "epoch": 0.40992712406683257, + "grad_norm": 2.21875, + "learning_rate": 1.1800444444444445e-05, + "loss": 0.404, + "step": 18450 + }, + { + "epoch": 0.410149306789904, + "grad_norm": 2.234375, + "learning_rate": 1.1796e-05, + "loss": 0.422, + "step": 18460 + }, + { + "epoch": 0.4103714895129755, + "grad_norm": 2.0625, + "learning_rate": 1.1791555555555557e-05, + "loss": 0.3502, + "step": 18470 + }, + { + "epoch": 0.4105936722360469, + "grad_norm": 2.09375, + "learning_rate": 1.1787111111111112e-05, + "loss": 0.3651, + "step": 18480 + }, + { + "epoch": 0.4108158549591184, + "grad_norm": 2.34375, + "learning_rate": 1.1782666666666666e-05, + "loss": 0.4026, + "step": 18490 + }, + { + "epoch": 0.4110380376821898, + "grad_norm": 2.125, + "learning_rate": 1.1778222222222225e-05, + "loss": 0.4164, + "step": 18500 + }, + { + "epoch": 0.4112602204052613, + "grad_norm": 1.96875, + "learning_rate": 1.177377777777778e-05, + "loss": 0.3597, + "step": 18510 + }, + { + "epoch": 0.41148240312833273, + "grad_norm": 2.0, + "learning_rate": 1.1769333333333334e-05, + "loss": 0.3407, + "step": 18520 + }, + { + "epoch": 0.4117045858514042, + "grad_norm": 2.140625, + "learning_rate": 1.1764888888888891e-05, + "loss": 0.4096, + "step": 18530 + }, + { + "epoch": 0.41192676857447563, + "grad_norm": 2.25, + "learning_rate": 1.1760444444444446e-05, + "loss": 0.3792, + "step": 18540 + }, + { + "epoch": 0.4121489512975471, + "grad_norm": 2.109375, + "learning_rate": 1.1756e-05, + "loss": 0.391, + "step": 18550 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 2.140625, + "learning_rate": 1.1751555555555555e-05, + "loss": 0.3822, + "step": 18560 + }, + { + "epoch": 0.41259331674369, + "grad_norm": 2.640625, + "learning_rate": 1.1747111111111112e-05, + "loss": 0.3866, + "step": 18570 + }, + { + "epoch": 0.41281549946676144, + "grad_norm": 2.234375, + "learning_rate": 1.1742666666666667e-05, + "loss": 0.3711, + "step": 18580 + }, + { + "epoch": 0.41303768218983294, + "grad_norm": 2.59375, + "learning_rate": 1.1738222222222223e-05, + "loss": 0.4253, + "step": 18590 + }, + { + "epoch": 0.4132598649129044, + "grad_norm": 2.34375, + "learning_rate": 1.173377777777778e-05, + "loss": 0.3965, + "step": 18600 + }, + { + "epoch": 0.41348204763597585, + "grad_norm": 2.234375, + "learning_rate": 1.1729333333333335e-05, + "loss": 0.4161, + "step": 18610 + }, + { + "epoch": 0.4137042303590473, + "grad_norm": 2.109375, + "learning_rate": 1.172488888888889e-05, + "loss": 0.3879, + "step": 18620 + }, + { + "epoch": 0.41392641308211875, + "grad_norm": 2.78125, + "learning_rate": 1.1720444444444446e-05, + "loss": 0.4, + "step": 18630 + }, + { + "epoch": 0.4141485958051902, + "grad_norm": 2.234375, + "learning_rate": 1.1716000000000001e-05, + "loss": 0.4214, + "step": 18640 + }, + { + "epoch": 0.41437077852826165, + "grad_norm": 2.265625, + "learning_rate": 1.1711555555555556e-05, + "loss": 0.4266, + "step": 18650 + }, + { + "epoch": 0.4145929612513331, + "grad_norm": 2.21875, + "learning_rate": 1.170711111111111e-05, + "loss": 0.4241, + "step": 18660 + }, + { + "epoch": 0.41481514397440455, + "grad_norm": 2.21875, + "learning_rate": 1.1702666666666667e-05, + "loss": 0.3736, + "step": 18670 + }, + { + "epoch": 0.415037326697476, + "grad_norm": 2.0625, + "learning_rate": 1.1698222222222224e-05, + "loss": 0.3792, + "step": 18680 + }, + { + "epoch": 0.41525950942054746, + "grad_norm": 2.03125, + "learning_rate": 1.1693777777777779e-05, + "loss": 0.3791, + "step": 18690 + }, + { + "epoch": 0.4154816921436189, + "grad_norm": 2.578125, + "learning_rate": 1.1689333333333335e-05, + "loss": 0.4218, + "step": 18700 + }, + { + "epoch": 0.41570387486669036, + "grad_norm": 2.140625, + "learning_rate": 1.168488888888889e-05, + "loss": 0.3575, + "step": 18710 + }, + { + "epoch": 0.4159260575897618, + "grad_norm": 2.46875, + "learning_rate": 1.1680444444444445e-05, + "loss": 0.4201, + "step": 18720 + }, + { + "epoch": 0.41614824031283326, + "grad_norm": 2.390625, + "learning_rate": 1.1676000000000001e-05, + "loss": 0.4063, + "step": 18730 + }, + { + "epoch": 0.4163704230359047, + "grad_norm": 2.171875, + "learning_rate": 1.1671555555555556e-05, + "loss": 0.4136, + "step": 18740 + }, + { + "epoch": 0.41659260575897616, + "grad_norm": 2.171875, + "learning_rate": 1.1667111111111111e-05, + "loss": 0.3875, + "step": 18750 + }, + { + "epoch": 0.4168147884820476, + "grad_norm": 2.21875, + "learning_rate": 1.1662666666666666e-05, + "loss": 0.4041, + "step": 18760 + }, + { + "epoch": 0.41703697120511907, + "grad_norm": 2.296875, + "learning_rate": 1.1658222222222224e-05, + "loss": 0.368, + "step": 18770 + }, + { + "epoch": 0.4172591539281905, + "grad_norm": 2.203125, + "learning_rate": 1.1653777777777779e-05, + "loss": 0.3725, + "step": 18780 + }, + { + "epoch": 0.417481336651262, + "grad_norm": 2.09375, + "learning_rate": 1.1649333333333336e-05, + "loss": 0.3615, + "step": 18790 + }, + { + "epoch": 0.4177035193743335, + "grad_norm": 2.453125, + "learning_rate": 1.164488888888889e-05, + "loss": 0.3657, + "step": 18800 + }, + { + "epoch": 0.4179257020974049, + "grad_norm": 2.296875, + "learning_rate": 1.1640444444444445e-05, + "loss": 0.4285, + "step": 18810 + }, + { + "epoch": 0.4181478848204764, + "grad_norm": 2.453125, + "learning_rate": 1.1636e-05, + "loss": 0.3817, + "step": 18820 + }, + { + "epoch": 0.41837006754354783, + "grad_norm": 1.921875, + "learning_rate": 1.1631555555555557e-05, + "loss": 0.4134, + "step": 18830 + }, + { + "epoch": 0.4185922502666193, + "grad_norm": 2.1875, + "learning_rate": 1.1627111111111111e-05, + "loss": 0.4261, + "step": 18840 + }, + { + "epoch": 0.41881443298969073, + "grad_norm": 2.5625, + "learning_rate": 1.1622666666666666e-05, + "loss": 0.3703, + "step": 18850 + }, + { + "epoch": 0.4190366157127622, + "grad_norm": 1.9765625, + "learning_rate": 1.1618222222222225e-05, + "loss": 0.3884, + "step": 18860 + }, + { + "epoch": 0.41925879843583364, + "grad_norm": 2.09375, + "learning_rate": 1.161377777777778e-05, + "loss": 0.364, + "step": 18870 + }, + { + "epoch": 0.4194809811589051, + "grad_norm": 1.984375, + "learning_rate": 1.1609333333333334e-05, + "loss": 0.3927, + "step": 18880 + }, + { + "epoch": 0.41970316388197654, + "grad_norm": 2.375, + "learning_rate": 1.160488888888889e-05, + "loss": 0.3564, + "step": 18890 + }, + { + "epoch": 0.419925346605048, + "grad_norm": 2.53125, + "learning_rate": 1.1600444444444446e-05, + "loss": 0.3842, + "step": 18900 + }, + { + "epoch": 0.42014752932811944, + "grad_norm": 1.90625, + "learning_rate": 1.1596e-05, + "loss": 0.4098, + "step": 18910 + }, + { + "epoch": 0.4203697120511909, + "grad_norm": 1.890625, + "learning_rate": 1.1591555555555555e-05, + "loss": 0.3775, + "step": 18920 + }, + { + "epoch": 0.42059189477426234, + "grad_norm": 2.3125, + "learning_rate": 1.1587111111111112e-05, + "loss": 0.4061, + "step": 18930 + }, + { + "epoch": 0.4208140774973338, + "grad_norm": 2.234375, + "learning_rate": 1.1582666666666668e-05, + "loss": 0.4203, + "step": 18940 + }, + { + "epoch": 0.42103626022040525, + "grad_norm": 2.03125, + "learning_rate": 1.1578222222222223e-05, + "loss": 0.4124, + "step": 18950 + }, + { + "epoch": 0.4212584429434767, + "grad_norm": 2.296875, + "learning_rate": 1.157377777777778e-05, + "loss": 0.3668, + "step": 18960 + }, + { + "epoch": 0.42148062566654815, + "grad_norm": 2.390625, + "learning_rate": 1.1569333333333335e-05, + "loss": 0.3855, + "step": 18970 + }, + { + "epoch": 0.4217028083896196, + "grad_norm": 2.015625, + "learning_rate": 1.156488888888889e-05, + "loss": 0.3755, + "step": 18980 + }, + { + "epoch": 0.42192499111269105, + "grad_norm": 2.3125, + "learning_rate": 1.1560444444444446e-05, + "loss": 0.4197, + "step": 18990 + }, + { + "epoch": 0.42214717383576256, + "grad_norm": 2.359375, + "learning_rate": 1.1556e-05, + "loss": 0.383, + "step": 19000 + }, + { + "epoch": 0.422369356558834, + "grad_norm": 1.90625, + "learning_rate": 1.1551555555555556e-05, + "loss": 0.4043, + "step": 19010 + }, + { + "epoch": 0.42259153928190546, + "grad_norm": 2.46875, + "learning_rate": 1.154711111111111e-05, + "loss": 0.401, + "step": 19020 + }, + { + "epoch": 0.4228137220049769, + "grad_norm": 1.7578125, + "learning_rate": 1.1542666666666669e-05, + "loss": 0.3877, + "step": 19030 + }, + { + "epoch": 0.42303590472804836, + "grad_norm": 2.5, + "learning_rate": 1.1538222222222224e-05, + "loss": 0.4399, + "step": 19040 + }, + { + "epoch": 0.4232580874511198, + "grad_norm": 2.1875, + "learning_rate": 1.153377777777778e-05, + "loss": 0.4048, + "step": 19050 + }, + { + "epoch": 0.42348027017419126, + "grad_norm": 2.15625, + "learning_rate": 1.1529333333333335e-05, + "loss": 0.3851, + "step": 19060 + }, + { + "epoch": 0.4237024528972627, + "grad_norm": 2.296875, + "learning_rate": 1.152488888888889e-05, + "loss": 0.4034, + "step": 19070 + }, + { + "epoch": 0.42392463562033417, + "grad_norm": 2.296875, + "learning_rate": 1.1520444444444445e-05, + "loss": 0.4198, + "step": 19080 + }, + { + "epoch": 0.4241468183434056, + "grad_norm": 1.7890625, + "learning_rate": 1.1516000000000001e-05, + "loss": 0.4207, + "step": 19090 + }, + { + "epoch": 0.42436900106647707, + "grad_norm": 2.28125, + "learning_rate": 1.1511555555555556e-05, + "loss": 0.4425, + "step": 19100 + }, + { + "epoch": 0.4245911837895485, + "grad_norm": 1.984375, + "learning_rate": 1.1507111111111111e-05, + "loss": 0.3781, + "step": 19110 + }, + { + "epoch": 0.42481336651261997, + "grad_norm": 2.109375, + "learning_rate": 1.1502666666666669e-05, + "loss": 0.4106, + "step": 19120 + }, + { + "epoch": 0.4250355492356914, + "grad_norm": 2.5625, + "learning_rate": 1.1498222222222224e-05, + "loss": 0.3809, + "step": 19130 + }, + { + "epoch": 0.4252577319587629, + "grad_norm": 2.015625, + "learning_rate": 1.1493777777777779e-05, + "loss": 0.3521, + "step": 19140 + }, + { + "epoch": 0.4254799146818343, + "grad_norm": 2.25, + "learning_rate": 1.1489333333333335e-05, + "loss": 0.388, + "step": 19150 + }, + { + "epoch": 0.4257020974049058, + "grad_norm": 2.40625, + "learning_rate": 1.148488888888889e-05, + "loss": 0.4044, + "step": 19160 + }, + { + "epoch": 0.42592428012797723, + "grad_norm": 2.15625, + "learning_rate": 1.1480444444444445e-05, + "loss": 0.4445, + "step": 19170 + }, + { + "epoch": 0.4261464628510487, + "grad_norm": 1.984375, + "learning_rate": 1.1476e-05, + "loss": 0.405, + "step": 19180 + }, + { + "epoch": 0.42636864557412013, + "grad_norm": 2.203125, + "learning_rate": 1.1471555555555556e-05, + "loss": 0.3559, + "step": 19190 + }, + { + "epoch": 0.42659082829719164, + "grad_norm": 2.4375, + "learning_rate": 1.1467111111111111e-05, + "loss": 0.393, + "step": 19200 + }, + { + "epoch": 0.4268130110202631, + "grad_norm": 2.21875, + "learning_rate": 1.1462666666666668e-05, + "loss": 0.3441, + "step": 19210 + }, + { + "epoch": 0.42703519374333454, + "grad_norm": 2.140625, + "learning_rate": 1.1458222222222224e-05, + "loss": 0.3888, + "step": 19220 + }, + { + "epoch": 0.427257376466406, + "grad_norm": 2.28125, + "learning_rate": 1.145377777777778e-05, + "loss": 0.399, + "step": 19230 + }, + { + "epoch": 0.42747955918947744, + "grad_norm": 2.109375, + "learning_rate": 1.1449333333333334e-05, + "loss": 0.3745, + "step": 19240 + }, + { + "epoch": 0.4277017419125489, + "grad_norm": 2.203125, + "learning_rate": 1.144488888888889e-05, + "loss": 0.3968, + "step": 19250 + }, + { + "epoch": 0.42792392463562035, + "grad_norm": 2.15625, + "learning_rate": 1.1440444444444445e-05, + "loss": 0.4016, + "step": 19260 + }, + { + "epoch": 0.4281461073586918, + "grad_norm": 2.09375, + "learning_rate": 1.1436e-05, + "loss": 0.3813, + "step": 19270 + }, + { + "epoch": 0.42836829008176325, + "grad_norm": 2.28125, + "learning_rate": 1.1431555555555555e-05, + "loss": 0.4106, + "step": 19280 + }, + { + "epoch": 0.4285904728048347, + "grad_norm": 2.609375, + "learning_rate": 1.1427111111111112e-05, + "loss": 0.3633, + "step": 19290 + }, + { + "epoch": 0.42881265552790615, + "grad_norm": 2.171875, + "learning_rate": 1.1422666666666668e-05, + "loss": 0.3968, + "step": 19300 + }, + { + "epoch": 0.4290348382509776, + "grad_norm": 2.296875, + "learning_rate": 1.1418222222222223e-05, + "loss": 0.4016, + "step": 19310 + }, + { + "epoch": 0.42925702097404905, + "grad_norm": 2.484375, + "learning_rate": 1.141377777777778e-05, + "loss": 0.3636, + "step": 19320 + }, + { + "epoch": 0.4294792036971205, + "grad_norm": 1.96875, + "learning_rate": 1.1409333333333334e-05, + "loss": 0.3992, + "step": 19330 + }, + { + "epoch": 0.42970138642019196, + "grad_norm": 1.9296875, + "learning_rate": 1.140488888888889e-05, + "loss": 0.4122, + "step": 19340 + }, + { + "epoch": 0.4299235691432634, + "grad_norm": 2.640625, + "learning_rate": 1.1400444444444446e-05, + "loss": 0.3773, + "step": 19350 + }, + { + "epoch": 0.43014575186633486, + "grad_norm": 2.625, + "learning_rate": 1.1396e-05, + "loss": 0.3969, + "step": 19360 + }, + { + "epoch": 0.4303679345894063, + "grad_norm": 1.953125, + "learning_rate": 1.1391555555555556e-05, + "loss": 0.3679, + "step": 19370 + }, + { + "epoch": 0.43059011731247776, + "grad_norm": 2.625, + "learning_rate": 1.138711111111111e-05, + "loss": 0.3742, + "step": 19380 + }, + { + "epoch": 0.4308123000355492, + "grad_norm": 2.40625, + "learning_rate": 1.1382666666666669e-05, + "loss": 0.3824, + "step": 19390 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 2.5, + "learning_rate": 1.1378222222222223e-05, + "loss": 0.4468, + "step": 19400 + }, + { + "epoch": 0.43125666548169217, + "grad_norm": 2.15625, + "learning_rate": 1.137377777777778e-05, + "loss": 0.3628, + "step": 19410 + }, + { + "epoch": 0.4314788482047636, + "grad_norm": 2.34375, + "learning_rate": 1.1369333333333335e-05, + "loss": 0.3756, + "step": 19420 + }, + { + "epoch": 0.43170103092783507, + "grad_norm": 2.15625, + "learning_rate": 1.136488888888889e-05, + "loss": 0.3864, + "step": 19430 + }, + { + "epoch": 0.4319232136509065, + "grad_norm": 2.015625, + "learning_rate": 1.1360444444444445e-05, + "loss": 0.4071, + "step": 19440 + }, + { + "epoch": 0.432145396373978, + "grad_norm": 2.53125, + "learning_rate": 1.1356000000000001e-05, + "loss": 0.4236, + "step": 19450 + }, + { + "epoch": 0.4323675790970494, + "grad_norm": 2.21875, + "learning_rate": 1.1351555555555556e-05, + "loss": 0.423, + "step": 19460 + }, + { + "epoch": 0.4325897618201209, + "grad_norm": 3.03125, + "learning_rate": 1.134711111111111e-05, + "loss": 0.379, + "step": 19470 + }, + { + "epoch": 0.43281194454319233, + "grad_norm": 2.546875, + "learning_rate": 1.1342666666666669e-05, + "loss": 0.4025, + "step": 19480 + }, + { + "epoch": 0.4330341272662638, + "grad_norm": 2.0625, + "learning_rate": 1.1338222222222224e-05, + "loss": 0.3735, + "step": 19490 + }, + { + "epoch": 0.43325630998933523, + "grad_norm": 2.59375, + "learning_rate": 1.1333777777777779e-05, + "loss": 0.397, + "step": 19500 + }, + { + "epoch": 0.4334784927124067, + "grad_norm": 2.4375, + "learning_rate": 1.1329333333333335e-05, + "loss": 0.4139, + "step": 19510 + }, + { + "epoch": 0.43370067543547813, + "grad_norm": 2.8125, + "learning_rate": 1.132488888888889e-05, + "loss": 0.391, + "step": 19520 + }, + { + "epoch": 0.4339228581585496, + "grad_norm": 2.328125, + "learning_rate": 1.1320444444444445e-05, + "loss": 0.3767, + "step": 19530 + }, + { + "epoch": 0.43414504088162104, + "grad_norm": 2.375, + "learning_rate": 1.1316e-05, + "loss": 0.4362, + "step": 19540 + }, + { + "epoch": 0.4343672236046925, + "grad_norm": 3.0, + "learning_rate": 1.1311555555555556e-05, + "loss": 0.4181, + "step": 19550 + }, + { + "epoch": 0.43458940632776394, + "grad_norm": 2.359375, + "learning_rate": 1.1307111111111111e-05, + "loss": 0.363, + "step": 19560 + }, + { + "epoch": 0.4348115890508354, + "grad_norm": 2.765625, + "learning_rate": 1.1302666666666668e-05, + "loss": 0.4238, + "step": 19570 + }, + { + "epoch": 0.43503377177390684, + "grad_norm": 2.0625, + "learning_rate": 1.1298222222222224e-05, + "loss": 0.3864, + "step": 19580 + }, + { + "epoch": 0.4352559544969783, + "grad_norm": 2.0, + "learning_rate": 1.1293777777777779e-05, + "loss": 0.4356, + "step": 19590 + }, + { + "epoch": 0.43547813722004974, + "grad_norm": 2.453125, + "learning_rate": 1.1289333333333334e-05, + "loss": 0.4085, + "step": 19600 + }, + { + "epoch": 0.43570031994312125, + "grad_norm": 2.03125, + "learning_rate": 1.128488888888889e-05, + "loss": 0.3822, + "step": 19610 + }, + { + "epoch": 0.4359225026661927, + "grad_norm": 2.265625, + "learning_rate": 1.1280444444444445e-05, + "loss": 0.3907, + "step": 19620 + }, + { + "epoch": 0.43614468538926415, + "grad_norm": 2.3125, + "learning_rate": 1.1276e-05, + "loss": 0.4203, + "step": 19630 + }, + { + "epoch": 0.4363668681123356, + "grad_norm": 1.875, + "learning_rate": 1.1271555555555555e-05, + "loss": 0.4278, + "step": 19640 + }, + { + "epoch": 0.43658905083540706, + "grad_norm": 2.140625, + "learning_rate": 1.1267111111111113e-05, + "loss": 0.3878, + "step": 19650 + }, + { + "epoch": 0.4368112335584785, + "grad_norm": 2.515625, + "learning_rate": 1.1262666666666668e-05, + "loss": 0.4072, + "step": 19660 + }, + { + "epoch": 0.43703341628154996, + "grad_norm": 2.265625, + "learning_rate": 1.1258222222222223e-05, + "loss": 0.3515, + "step": 19670 + }, + { + "epoch": 0.4372555990046214, + "grad_norm": 2.96875, + "learning_rate": 1.125377777777778e-05, + "loss": 0.3994, + "step": 19680 + }, + { + "epoch": 0.43747778172769286, + "grad_norm": 2.921875, + "learning_rate": 1.1249333333333334e-05, + "loss": 0.4353, + "step": 19690 + }, + { + "epoch": 0.4376999644507643, + "grad_norm": 2.015625, + "learning_rate": 1.1244888888888889e-05, + "loss": 0.42, + "step": 19700 + }, + { + "epoch": 0.43792214717383576, + "grad_norm": 1.9375, + "learning_rate": 1.1240444444444446e-05, + "loss": 0.3881, + "step": 19710 + }, + { + "epoch": 0.4381443298969072, + "grad_norm": 2.046875, + "learning_rate": 1.1236e-05, + "loss": 0.3918, + "step": 19720 + }, + { + "epoch": 0.43836651261997867, + "grad_norm": 2.234375, + "learning_rate": 1.1231555555555555e-05, + "loss": 0.3917, + "step": 19730 + }, + { + "epoch": 0.4385886953430501, + "grad_norm": 2.375, + "learning_rate": 1.1227111111111114e-05, + "loss": 0.3976, + "step": 19740 + }, + { + "epoch": 0.43881087806612157, + "grad_norm": 1.75, + "learning_rate": 1.1222666666666668e-05, + "loss": 0.3924, + "step": 19750 + }, + { + "epoch": 0.439033060789193, + "grad_norm": 2.34375, + "learning_rate": 1.1218222222222223e-05, + "loss": 0.3909, + "step": 19760 + }, + { + "epoch": 0.43925524351226447, + "grad_norm": 1.8984375, + "learning_rate": 1.121377777777778e-05, + "loss": 0.42, + "step": 19770 + }, + { + "epoch": 0.4394774262353359, + "grad_norm": 2.109375, + "learning_rate": 1.1209333333333335e-05, + "loss": 0.3655, + "step": 19780 + }, + { + "epoch": 0.4396996089584074, + "grad_norm": 2.6875, + "learning_rate": 1.120488888888889e-05, + "loss": 0.3663, + "step": 19790 + }, + { + "epoch": 0.4399217916814788, + "grad_norm": 2.125, + "learning_rate": 1.1200444444444444e-05, + "loss": 0.3854, + "step": 19800 + }, + { + "epoch": 0.4401439744045503, + "grad_norm": 2.15625, + "learning_rate": 1.1196000000000001e-05, + "loss": 0.374, + "step": 19810 + }, + { + "epoch": 0.4403661571276218, + "grad_norm": 1.875, + "learning_rate": 1.1191555555555556e-05, + "loss": 0.3725, + "step": 19820 + }, + { + "epoch": 0.44058833985069323, + "grad_norm": 1.8984375, + "learning_rate": 1.1187111111111112e-05, + "loss": 0.3875, + "step": 19830 + }, + { + "epoch": 0.4408105225737647, + "grad_norm": 2.84375, + "learning_rate": 1.1182666666666669e-05, + "loss": 0.4269, + "step": 19840 + }, + { + "epoch": 0.44103270529683614, + "grad_norm": 2.046875, + "learning_rate": 1.1178222222222224e-05, + "loss": 0.3947, + "step": 19850 + }, + { + "epoch": 0.4412548880199076, + "grad_norm": 2.46875, + "learning_rate": 1.1173777777777779e-05, + "loss": 0.4126, + "step": 19860 + }, + { + "epoch": 0.44147707074297904, + "grad_norm": 2.640625, + "learning_rate": 1.1169333333333335e-05, + "loss": 0.4062, + "step": 19870 + }, + { + "epoch": 0.4416992534660505, + "grad_norm": 2.703125, + "learning_rate": 1.116488888888889e-05, + "loss": 0.3836, + "step": 19880 + }, + { + "epoch": 0.44192143618912194, + "grad_norm": 1.9921875, + "learning_rate": 1.1160444444444445e-05, + "loss": 0.3643, + "step": 19890 + }, + { + "epoch": 0.4421436189121934, + "grad_norm": 2.03125, + "learning_rate": 1.1156e-05, + "loss": 0.4177, + "step": 19900 + }, + { + "epoch": 0.44236580163526484, + "grad_norm": 2.234375, + "learning_rate": 1.1151555555555556e-05, + "loss": 0.3948, + "step": 19910 + }, + { + "epoch": 0.4425879843583363, + "grad_norm": 2.546875, + "learning_rate": 1.1147111111111113e-05, + "loss": 0.3898, + "step": 19920 + }, + { + "epoch": 0.44281016708140775, + "grad_norm": 1.953125, + "learning_rate": 1.1142666666666667e-05, + "loss": 0.3983, + "step": 19930 + }, + { + "epoch": 0.4430323498044792, + "grad_norm": 2.296875, + "learning_rate": 1.1138222222222224e-05, + "loss": 0.3946, + "step": 19940 + }, + { + "epoch": 0.44325453252755065, + "grad_norm": 2.375, + "learning_rate": 1.1133777777777779e-05, + "loss": 0.3744, + "step": 19950 + }, + { + "epoch": 0.4434767152506221, + "grad_norm": 2.59375, + "learning_rate": 1.1129333333333334e-05, + "loss": 0.4106, + "step": 19960 + }, + { + "epoch": 0.44369889797369355, + "grad_norm": 2.46875, + "learning_rate": 1.112488888888889e-05, + "loss": 0.3814, + "step": 19970 + }, + { + "epoch": 0.443921080696765, + "grad_norm": 2.265625, + "learning_rate": 1.1120444444444445e-05, + "loss": 0.4158, + "step": 19980 + }, + { + "epoch": 0.44414326341983645, + "grad_norm": 2.171875, + "learning_rate": 1.1116e-05, + "loss": 0.4096, + "step": 19990 + }, + { + "epoch": 0.4443654461429079, + "grad_norm": 2.578125, + "learning_rate": 1.1111555555555555e-05, + "loss": 0.3942, + "step": 20000 + }, + { + "epoch": 0.44458762886597936, + "grad_norm": 2.390625, + "learning_rate": 1.1107111111111113e-05, + "loss": 0.3741, + "step": 20010 + }, + { + "epoch": 0.44480981158905086, + "grad_norm": 2.203125, + "learning_rate": 1.1102666666666668e-05, + "loss": 0.3804, + "step": 20020 + }, + { + "epoch": 0.4450319943121223, + "grad_norm": 1.8828125, + "learning_rate": 1.1098222222222223e-05, + "loss": 0.3675, + "step": 20030 + }, + { + "epoch": 0.44525417703519377, + "grad_norm": 2.4375, + "learning_rate": 1.109377777777778e-05, + "loss": 0.3957, + "step": 20040 + }, + { + "epoch": 0.4454763597582652, + "grad_norm": 2.59375, + "learning_rate": 1.1089333333333334e-05, + "loss": 0.3769, + "step": 20050 + }, + { + "epoch": 0.44569854248133667, + "grad_norm": 2.0, + "learning_rate": 1.1084888888888889e-05, + "loss": 0.3605, + "step": 20060 + }, + { + "epoch": 0.4459207252044081, + "grad_norm": 2.375, + "learning_rate": 1.1080444444444445e-05, + "loss": 0.4117, + "step": 20070 + }, + { + "epoch": 0.44614290792747957, + "grad_norm": 1.9609375, + "learning_rate": 1.1076e-05, + "loss": 0.4063, + "step": 20080 + }, + { + "epoch": 0.446365090650551, + "grad_norm": 2.21875, + "learning_rate": 1.1071555555555555e-05, + "loss": 0.4014, + "step": 20090 + }, + { + "epoch": 0.4465872733736225, + "grad_norm": 2.546875, + "learning_rate": 1.1067111111111113e-05, + "loss": 0.3968, + "step": 20100 + }, + { + "epoch": 0.4468094560966939, + "grad_norm": 2.3125, + "learning_rate": 1.1062666666666668e-05, + "loss": 0.3791, + "step": 20110 + }, + { + "epoch": 0.4470316388197654, + "grad_norm": 2.453125, + "learning_rate": 1.1058222222222223e-05, + "loss": 0.4061, + "step": 20120 + }, + { + "epoch": 0.4472538215428368, + "grad_norm": 2.375, + "learning_rate": 1.105377777777778e-05, + "loss": 0.4039, + "step": 20130 + }, + { + "epoch": 0.4474760042659083, + "grad_norm": 1.8125, + "learning_rate": 1.1049333333333334e-05, + "loss": 0.3768, + "step": 20140 + }, + { + "epoch": 0.44769818698897973, + "grad_norm": 2.296875, + "learning_rate": 1.104488888888889e-05, + "loss": 0.388, + "step": 20150 + }, + { + "epoch": 0.4479203697120512, + "grad_norm": 2.296875, + "learning_rate": 1.1040444444444444e-05, + "loss": 0.4123, + "step": 20160 + }, + { + "epoch": 0.44814255243512263, + "grad_norm": 2.484375, + "learning_rate": 1.1036e-05, + "loss": 0.3996, + "step": 20170 + }, + { + "epoch": 0.4483647351581941, + "grad_norm": 2.0, + "learning_rate": 1.1031555555555556e-05, + "loss": 0.3852, + "step": 20180 + }, + { + "epoch": 0.44858691788126553, + "grad_norm": 2.015625, + "learning_rate": 1.1027111111111112e-05, + "loss": 0.3636, + "step": 20190 + }, + { + "epoch": 0.448809100604337, + "grad_norm": 2.046875, + "learning_rate": 1.1022666666666669e-05, + "loss": 0.4001, + "step": 20200 + }, + { + "epoch": 0.44903128332740844, + "grad_norm": 1.75, + "learning_rate": 1.1018222222222223e-05, + "loss": 0.3724, + "step": 20210 + }, + { + "epoch": 0.4492534660504799, + "grad_norm": 2.453125, + "learning_rate": 1.1013777777777778e-05, + "loss": 0.3837, + "step": 20220 + }, + { + "epoch": 0.4494756487735514, + "grad_norm": 2.109375, + "learning_rate": 1.1009333333333335e-05, + "loss": 0.3334, + "step": 20230 + }, + { + "epoch": 0.44969783149662285, + "grad_norm": 2.75, + "learning_rate": 1.100488888888889e-05, + "loss": 0.41, + "step": 20240 + }, + { + "epoch": 0.4499200142196943, + "grad_norm": 2.0625, + "learning_rate": 1.1000444444444445e-05, + "loss": 0.4168, + "step": 20250 + }, + { + "epoch": 0.45014219694276575, + "grad_norm": 2.21875, + "learning_rate": 1.0996e-05, + "loss": 0.4111, + "step": 20260 + }, + { + "epoch": 0.4503643796658372, + "grad_norm": 1.8671875, + "learning_rate": 1.0991555555555558e-05, + "loss": 0.3801, + "step": 20270 + }, + { + "epoch": 0.45058656238890865, + "grad_norm": 2.375, + "learning_rate": 1.0987111111111112e-05, + "loss": 0.3671, + "step": 20280 + }, + { + "epoch": 0.4508087451119801, + "grad_norm": 2.09375, + "learning_rate": 1.0982666666666667e-05, + "loss": 0.3702, + "step": 20290 + }, + { + "epoch": 0.45103092783505155, + "grad_norm": 2.46875, + "learning_rate": 1.0978222222222224e-05, + "loss": 0.4006, + "step": 20300 + }, + { + "epoch": 0.451253110558123, + "grad_norm": 2.109375, + "learning_rate": 1.0973777777777779e-05, + "loss": 0.4027, + "step": 20310 + }, + { + "epoch": 0.45147529328119446, + "grad_norm": 2.390625, + "learning_rate": 1.0969333333333334e-05, + "loss": 0.3681, + "step": 20320 + }, + { + "epoch": 0.4516974760042659, + "grad_norm": 1.9375, + "learning_rate": 1.096488888888889e-05, + "loss": 0.3681, + "step": 20330 + }, + { + "epoch": 0.45191965872733736, + "grad_norm": 2.390625, + "learning_rate": 1.0960444444444445e-05, + "loss": 0.3897, + "step": 20340 + }, + { + "epoch": 0.4521418414504088, + "grad_norm": 2.71875, + "learning_rate": 1.0956e-05, + "loss": 0.4075, + "step": 20350 + }, + { + "epoch": 0.45236402417348026, + "grad_norm": 1.8984375, + "learning_rate": 1.0951555555555558e-05, + "loss": 0.377, + "step": 20360 + }, + { + "epoch": 0.4525862068965517, + "grad_norm": 2.0, + "learning_rate": 1.0947111111111113e-05, + "loss": 0.372, + "step": 20370 + }, + { + "epoch": 0.45280838961962316, + "grad_norm": 2.109375, + "learning_rate": 1.0942666666666668e-05, + "loss": 0.389, + "step": 20380 + }, + { + "epoch": 0.4530305723426946, + "grad_norm": 2.125, + "learning_rate": 1.0938222222222223e-05, + "loss": 0.3631, + "step": 20390 + }, + { + "epoch": 0.45325275506576607, + "grad_norm": 2.265625, + "learning_rate": 1.0933777777777779e-05, + "loss": 0.3857, + "step": 20400 + }, + { + "epoch": 0.4534749377888375, + "grad_norm": 2.109375, + "learning_rate": 1.0929333333333334e-05, + "loss": 0.3991, + "step": 20410 + }, + { + "epoch": 0.45369712051190897, + "grad_norm": 2.21875, + "learning_rate": 1.0924888888888889e-05, + "loss": 0.3523, + "step": 20420 + }, + { + "epoch": 0.4539193032349804, + "grad_norm": 2.0, + "learning_rate": 1.0920444444444445e-05, + "loss": 0.4037, + "step": 20430 + }, + { + "epoch": 0.4541414859580519, + "grad_norm": 2.25, + "learning_rate": 1.0916e-05, + "loss": 0.3844, + "step": 20440 + }, + { + "epoch": 0.4543636686811234, + "grad_norm": 1.9609375, + "learning_rate": 1.0911555555555557e-05, + "loss": 0.3743, + "step": 20450 + }, + { + "epoch": 0.45458585140419483, + "grad_norm": 1.9765625, + "learning_rate": 1.0907111111111113e-05, + "loss": 0.3594, + "step": 20460 + }, + { + "epoch": 0.4548080341272663, + "grad_norm": 2.4375, + "learning_rate": 1.0902666666666668e-05, + "loss": 0.391, + "step": 20470 + }, + { + "epoch": 0.45503021685033773, + "grad_norm": 2.171875, + "learning_rate": 1.0898222222222223e-05, + "loss": 0.3538, + "step": 20480 + }, + { + "epoch": 0.4552523995734092, + "grad_norm": 2.390625, + "learning_rate": 1.089377777777778e-05, + "loss": 0.398, + "step": 20490 + }, + { + "epoch": 0.45547458229648063, + "grad_norm": 2.109375, + "learning_rate": 1.0889333333333334e-05, + "loss": 0.4254, + "step": 20500 + }, + { + "epoch": 0.4556967650195521, + "grad_norm": 2.234375, + "learning_rate": 1.088488888888889e-05, + "loss": 0.4078, + "step": 20510 + }, + { + "epoch": 0.45591894774262354, + "grad_norm": 2.203125, + "learning_rate": 1.0880444444444444e-05, + "loss": 0.3843, + "step": 20520 + }, + { + "epoch": 0.456141130465695, + "grad_norm": 2.421875, + "learning_rate": 1.0876e-05, + "loss": 0.4057, + "step": 20530 + }, + { + "epoch": 0.45636331318876644, + "grad_norm": 2.25, + "learning_rate": 1.0871555555555557e-05, + "loss": 0.3866, + "step": 20540 + }, + { + "epoch": 0.4565854959118379, + "grad_norm": 2.28125, + "learning_rate": 1.0867111111111112e-05, + "loss": 0.3896, + "step": 20550 + }, + { + "epoch": 0.45680767863490934, + "grad_norm": 2.703125, + "learning_rate": 1.0862666666666668e-05, + "loss": 0.4137, + "step": 20560 + }, + { + "epoch": 0.4570298613579808, + "grad_norm": 2.015625, + "learning_rate": 1.0858222222222223e-05, + "loss": 0.3563, + "step": 20570 + }, + { + "epoch": 0.45725204408105224, + "grad_norm": 2.1875, + "learning_rate": 1.0853777777777778e-05, + "loss": 0.4091, + "step": 20580 + }, + { + "epoch": 0.4574742268041237, + "grad_norm": 2.3125, + "learning_rate": 1.0849333333333335e-05, + "loss": 0.3981, + "step": 20590 + }, + { + "epoch": 0.45769640952719515, + "grad_norm": 2.59375, + "learning_rate": 1.084488888888889e-05, + "loss": 0.4079, + "step": 20600 + }, + { + "epoch": 0.4579185922502666, + "grad_norm": 1.9296875, + "learning_rate": 1.0840444444444444e-05, + "loss": 0.3801, + "step": 20610 + }, + { + "epoch": 0.45814077497333805, + "grad_norm": 1.890625, + "learning_rate": 1.0836e-05, + "loss": 0.3811, + "step": 20620 + }, + { + "epoch": 0.4583629576964095, + "grad_norm": 2.28125, + "learning_rate": 1.0831555555555557e-05, + "loss": 0.3834, + "step": 20630 + }, + { + "epoch": 0.458585140419481, + "grad_norm": 2.359375, + "learning_rate": 1.0827111111111112e-05, + "loss": 0.394, + "step": 20640 + }, + { + "epoch": 0.45880732314255246, + "grad_norm": 2.53125, + "learning_rate": 1.0822666666666667e-05, + "loss": 0.412, + "step": 20650 + }, + { + "epoch": 0.4590295058656239, + "grad_norm": 2.578125, + "learning_rate": 1.0818222222222224e-05, + "loss": 0.424, + "step": 20660 + }, + { + "epoch": 0.45925168858869536, + "grad_norm": 2.109375, + "learning_rate": 1.0813777777777779e-05, + "loss": 0.3918, + "step": 20670 + }, + { + "epoch": 0.4594738713117668, + "grad_norm": 2.265625, + "learning_rate": 1.0809333333333333e-05, + "loss": 0.3855, + "step": 20680 + }, + { + "epoch": 0.45969605403483826, + "grad_norm": 2.390625, + "learning_rate": 1.080488888888889e-05, + "loss": 0.3949, + "step": 20690 + }, + { + "epoch": 0.4599182367579097, + "grad_norm": 1.84375, + "learning_rate": 1.0800444444444445e-05, + "loss": 0.4035, + "step": 20700 + }, + { + "epoch": 0.46014041948098117, + "grad_norm": 2.125, + "learning_rate": 1.0796e-05, + "loss": 0.3553, + "step": 20710 + }, + { + "epoch": 0.4603626022040526, + "grad_norm": 2.265625, + "learning_rate": 1.0791555555555558e-05, + "loss": 0.4035, + "step": 20720 + }, + { + "epoch": 0.46058478492712407, + "grad_norm": 2.359375, + "learning_rate": 1.0787111111111113e-05, + "loss": 0.3682, + "step": 20730 + }, + { + "epoch": 0.4608069676501955, + "grad_norm": 2.21875, + "learning_rate": 1.0782666666666668e-05, + "loss": 0.384, + "step": 20740 + }, + { + "epoch": 0.46102915037326697, + "grad_norm": 2.234375, + "learning_rate": 1.0778222222222224e-05, + "loss": 0.3888, + "step": 20750 + }, + { + "epoch": 0.4612513330963384, + "grad_norm": 2.40625, + "learning_rate": 1.0773777777777779e-05, + "loss": 0.4149, + "step": 20760 + }, + { + "epoch": 0.4614735158194099, + "grad_norm": 2.328125, + "learning_rate": 1.0769333333333334e-05, + "loss": 0.3992, + "step": 20770 + }, + { + "epoch": 0.4616956985424813, + "grad_norm": 2.40625, + "learning_rate": 1.0764888888888889e-05, + "loss": 0.4211, + "step": 20780 + }, + { + "epoch": 0.4619178812655528, + "grad_norm": 2.03125, + "learning_rate": 1.0760444444444445e-05, + "loss": 0.3719, + "step": 20790 + }, + { + "epoch": 0.46214006398862423, + "grad_norm": 2.578125, + "learning_rate": 1.0756e-05, + "loss": 0.3809, + "step": 20800 + }, + { + "epoch": 0.4623622467116957, + "grad_norm": 2.28125, + "learning_rate": 1.0751555555555557e-05, + "loss": 0.3777, + "step": 20810 + }, + { + "epoch": 0.46258442943476713, + "grad_norm": 2.71875, + "learning_rate": 1.0747111111111113e-05, + "loss": 0.3977, + "step": 20820 + }, + { + "epoch": 0.4628066121578386, + "grad_norm": 2.65625, + "learning_rate": 1.0742666666666668e-05, + "loss": 0.3746, + "step": 20830 + }, + { + "epoch": 0.46302879488091003, + "grad_norm": 2.34375, + "learning_rate": 1.0738222222222223e-05, + "loss": 0.4218, + "step": 20840 + }, + { + "epoch": 0.46325097760398154, + "grad_norm": 2.46875, + "learning_rate": 1.073377777777778e-05, + "loss": 0.4456, + "step": 20850 + }, + { + "epoch": 0.463473160327053, + "grad_norm": 2.21875, + "learning_rate": 1.0729333333333334e-05, + "loss": 0.3884, + "step": 20860 + }, + { + "epoch": 0.46369534305012444, + "grad_norm": 2.078125, + "learning_rate": 1.0724888888888889e-05, + "loss": 0.417, + "step": 20870 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 2.421875, + "learning_rate": 1.0720444444444444e-05, + "loss": 0.3808, + "step": 20880 + }, + { + "epoch": 0.46413970849626734, + "grad_norm": 2.390625, + "learning_rate": 1.0716000000000002e-05, + "loss": 0.3692, + "step": 20890 + }, + { + "epoch": 0.4643618912193388, + "grad_norm": 2.765625, + "learning_rate": 1.0711555555555557e-05, + "loss": 0.4041, + "step": 20900 + }, + { + "epoch": 0.46458407394241025, + "grad_norm": 1.71875, + "learning_rate": 1.0707111111111112e-05, + "loss": 0.3721, + "step": 20910 + }, + { + "epoch": 0.4648062566654817, + "grad_norm": 2.21875, + "learning_rate": 1.0702666666666668e-05, + "loss": 0.4174, + "step": 20920 + }, + { + "epoch": 0.46502843938855315, + "grad_norm": 2.28125, + "learning_rate": 1.0698222222222223e-05, + "loss": 0.3944, + "step": 20930 + }, + { + "epoch": 0.4652506221116246, + "grad_norm": 3.015625, + "learning_rate": 1.0693777777777778e-05, + "loss": 0.3833, + "step": 20940 + }, + { + "epoch": 0.46547280483469605, + "grad_norm": 2.28125, + "learning_rate": 1.0689333333333335e-05, + "loss": 0.4119, + "step": 20950 + }, + { + "epoch": 0.4656949875577675, + "grad_norm": 1.75, + "learning_rate": 1.068488888888889e-05, + "loss": 0.3927, + "step": 20960 + }, + { + "epoch": 0.46591717028083895, + "grad_norm": 2.25, + "learning_rate": 1.0680444444444444e-05, + "loss": 0.394, + "step": 20970 + }, + { + "epoch": 0.4661393530039104, + "grad_norm": 2.296875, + "learning_rate": 1.0676000000000002e-05, + "loss": 0.3588, + "step": 20980 + }, + { + "epoch": 0.46636153572698186, + "grad_norm": 2.671875, + "learning_rate": 1.0671555555555557e-05, + "loss": 0.3979, + "step": 20990 + }, + { + "epoch": 0.4665837184500533, + "grad_norm": 2.171875, + "learning_rate": 1.0667111111111112e-05, + "loss": 0.3983, + "step": 21000 + }, + { + "epoch": 0.46680590117312476, + "grad_norm": 1.9453125, + "learning_rate": 1.0662666666666667e-05, + "loss": 0.3962, + "step": 21010 + }, + { + "epoch": 0.4670280838961962, + "grad_norm": 2.46875, + "learning_rate": 1.0658222222222224e-05, + "loss": 0.4133, + "step": 21020 + }, + { + "epoch": 0.46725026661926766, + "grad_norm": 2.5625, + "learning_rate": 1.0653777777777778e-05, + "loss": 0.3879, + "step": 21030 + }, + { + "epoch": 0.4674724493423391, + "grad_norm": 1.8828125, + "learning_rate": 1.0649333333333333e-05, + "loss": 0.4, + "step": 21040 + }, + { + "epoch": 0.4676946320654106, + "grad_norm": 2.15625, + "learning_rate": 1.064488888888889e-05, + "loss": 0.3913, + "step": 21050 + }, + { + "epoch": 0.46791681478848207, + "grad_norm": 2.65625, + "learning_rate": 1.0640444444444445e-05, + "loss": 0.3635, + "step": 21060 + }, + { + "epoch": 0.4681389975115535, + "grad_norm": 2.1875, + "learning_rate": 1.0636000000000001e-05, + "loss": 0.3726, + "step": 21070 + }, + { + "epoch": 0.468361180234625, + "grad_norm": 2.3125, + "learning_rate": 1.0631555555555558e-05, + "loss": 0.3947, + "step": 21080 + }, + { + "epoch": 0.4685833629576964, + "grad_norm": 1.890625, + "learning_rate": 1.0627111111111113e-05, + "loss": 0.3738, + "step": 21090 + }, + { + "epoch": 0.4688055456807679, + "grad_norm": 2.296875, + "learning_rate": 1.0622666666666667e-05, + "loss": 0.4123, + "step": 21100 + }, + { + "epoch": 0.46902772840383933, + "grad_norm": 2.484375, + "learning_rate": 1.0618222222222224e-05, + "loss": 0.4035, + "step": 21110 + }, + { + "epoch": 0.4692499111269108, + "grad_norm": 2.515625, + "learning_rate": 1.0613777777777779e-05, + "loss": 0.4067, + "step": 21120 + }, + { + "epoch": 0.46947209384998223, + "grad_norm": 2.28125, + "learning_rate": 1.0609333333333334e-05, + "loss": 0.3745, + "step": 21130 + }, + { + "epoch": 0.4696942765730537, + "grad_norm": 2.40625, + "learning_rate": 1.0604888888888888e-05, + "loss": 0.3901, + "step": 21140 + }, + { + "epoch": 0.46991645929612513, + "grad_norm": 2.75, + "learning_rate": 1.0600444444444445e-05, + "loss": 0.4082, + "step": 21150 + }, + { + "epoch": 0.4701386420191966, + "grad_norm": 2.796875, + "learning_rate": 1.0596000000000002e-05, + "loss": 0.3994, + "step": 21160 + }, + { + "epoch": 0.47036082474226804, + "grad_norm": 2.375, + "learning_rate": 1.0591555555555556e-05, + "loss": 0.4238, + "step": 21170 + }, + { + "epoch": 0.4705830074653395, + "grad_norm": 1.9921875, + "learning_rate": 1.0587111111111113e-05, + "loss": 0.4065, + "step": 21180 + }, + { + "epoch": 0.47080519018841094, + "grad_norm": 2.0625, + "learning_rate": 1.0582666666666668e-05, + "loss": 0.4038, + "step": 21190 + }, + { + "epoch": 0.4710273729114824, + "grad_norm": 2.15625, + "learning_rate": 1.0578222222222223e-05, + "loss": 0.3967, + "step": 21200 + }, + { + "epoch": 0.47124955563455384, + "grad_norm": 2.828125, + "learning_rate": 1.057377777777778e-05, + "loss": 0.3742, + "step": 21210 + }, + { + "epoch": 0.4714717383576253, + "grad_norm": 1.921875, + "learning_rate": 1.0569333333333334e-05, + "loss": 0.4002, + "step": 21220 + }, + { + "epoch": 0.47169392108069674, + "grad_norm": 2.28125, + "learning_rate": 1.0564888888888889e-05, + "loss": 0.3993, + "step": 21230 + }, + { + "epoch": 0.4719161038037682, + "grad_norm": 2.40625, + "learning_rate": 1.0560444444444444e-05, + "loss": 0.4094, + "step": 21240 + }, + { + "epoch": 0.47213828652683965, + "grad_norm": 2.375, + "learning_rate": 1.0556000000000002e-05, + "loss": 0.3779, + "step": 21250 + }, + { + "epoch": 0.47236046924991115, + "grad_norm": 2.578125, + "learning_rate": 1.0551555555555557e-05, + "loss": 0.3729, + "step": 21260 + }, + { + "epoch": 0.4725826519729826, + "grad_norm": 2.03125, + "learning_rate": 1.0547111111111112e-05, + "loss": 0.3719, + "step": 21270 + }, + { + "epoch": 0.47280483469605405, + "grad_norm": 2.5, + "learning_rate": 1.0542666666666668e-05, + "loss": 0.385, + "step": 21280 + }, + { + "epoch": 0.4730270174191255, + "grad_norm": 2.40625, + "learning_rate": 1.0538222222222223e-05, + "loss": 0.3988, + "step": 21290 + }, + { + "epoch": 0.47324920014219696, + "grad_norm": 2.609375, + "learning_rate": 1.0533777777777778e-05, + "loss": 0.3811, + "step": 21300 + }, + { + "epoch": 0.4734713828652684, + "grad_norm": 2.4375, + "learning_rate": 1.0529333333333334e-05, + "loss": 0.4309, + "step": 21310 + }, + { + "epoch": 0.47369356558833986, + "grad_norm": 2.359375, + "learning_rate": 1.052488888888889e-05, + "loss": 0.3763, + "step": 21320 + }, + { + "epoch": 0.4739157483114113, + "grad_norm": 2.671875, + "learning_rate": 1.0520444444444444e-05, + "loss": 0.3886, + "step": 21330 + }, + { + "epoch": 0.47413793103448276, + "grad_norm": 3.0, + "learning_rate": 1.0516000000000002e-05, + "loss": 0.3976, + "step": 21340 + }, + { + "epoch": 0.4743601137575542, + "grad_norm": 2.265625, + "learning_rate": 1.0511555555555557e-05, + "loss": 0.3736, + "step": 21350 + }, + { + "epoch": 0.47458229648062566, + "grad_norm": 2.421875, + "learning_rate": 1.0507111111111112e-05, + "loss": 0.4161, + "step": 21360 + }, + { + "epoch": 0.4748044792036971, + "grad_norm": 2.328125, + "learning_rate": 1.0502666666666667e-05, + "loss": 0.4237, + "step": 21370 + }, + { + "epoch": 0.47502666192676857, + "grad_norm": 2.28125, + "learning_rate": 1.0498222222222223e-05, + "loss": 0.3793, + "step": 21380 + }, + { + "epoch": 0.47524884464984, + "grad_norm": 2.21875, + "learning_rate": 1.0493777777777778e-05, + "loss": 0.4024, + "step": 21390 + }, + { + "epoch": 0.47547102737291147, + "grad_norm": 2.96875, + "learning_rate": 1.0489333333333333e-05, + "loss": 0.3968, + "step": 21400 + }, + { + "epoch": 0.4756932100959829, + "grad_norm": 2.53125, + "learning_rate": 1.048488888888889e-05, + "loss": 0.402, + "step": 21410 + }, + { + "epoch": 0.4759153928190544, + "grad_norm": 2.171875, + "learning_rate": 1.0480444444444444e-05, + "loss": 0.3966, + "step": 21420 + }, + { + "epoch": 0.4761375755421258, + "grad_norm": 2.53125, + "learning_rate": 1.0476000000000001e-05, + "loss": 0.4159, + "step": 21430 + }, + { + "epoch": 0.4763597582651973, + "grad_norm": 2.28125, + "learning_rate": 1.0471555555555558e-05, + "loss": 0.4008, + "step": 21440 + }, + { + "epoch": 0.4765819409882687, + "grad_norm": 2.25, + "learning_rate": 1.0467111111111112e-05, + "loss": 0.3913, + "step": 21450 + }, + { + "epoch": 0.47680412371134023, + "grad_norm": 2.25, + "learning_rate": 1.0462666666666667e-05, + "loss": 0.3975, + "step": 21460 + }, + { + "epoch": 0.4770263064344117, + "grad_norm": 2.28125, + "learning_rate": 1.0458222222222224e-05, + "loss": 0.3919, + "step": 21470 + }, + { + "epoch": 0.47724848915748314, + "grad_norm": 2.328125, + "learning_rate": 1.0453777777777779e-05, + "loss": 0.4096, + "step": 21480 + }, + { + "epoch": 0.4774706718805546, + "grad_norm": 2.375, + "learning_rate": 1.0449333333333333e-05, + "loss": 0.3979, + "step": 21490 + }, + { + "epoch": 0.47769285460362604, + "grad_norm": 2.921875, + "learning_rate": 1.0444888888888888e-05, + "loss": 0.388, + "step": 21500 + }, + { + "epoch": 0.4779150373266975, + "grad_norm": 2.140625, + "learning_rate": 1.0440444444444447e-05, + "loss": 0.3575, + "step": 21510 + }, + { + "epoch": 0.47813722004976894, + "grad_norm": 2.640625, + "learning_rate": 1.0436000000000001e-05, + "loss": 0.4031, + "step": 21520 + }, + { + "epoch": 0.4783594027728404, + "grad_norm": 2.390625, + "learning_rate": 1.0431555555555556e-05, + "loss": 0.386, + "step": 21530 + }, + { + "epoch": 0.47858158549591184, + "grad_norm": 2.453125, + "learning_rate": 1.0427111111111113e-05, + "loss": 0.3975, + "step": 21540 + }, + { + "epoch": 0.4788037682189833, + "grad_norm": 2.265625, + "learning_rate": 1.0422666666666668e-05, + "loss": 0.3538, + "step": 21550 + }, + { + "epoch": 0.47902595094205475, + "grad_norm": 1.9375, + "learning_rate": 1.0418222222222222e-05, + "loss": 0.3753, + "step": 21560 + }, + { + "epoch": 0.4792481336651262, + "grad_norm": 2.546875, + "learning_rate": 1.0413777777777779e-05, + "loss": 0.3741, + "step": 21570 + }, + { + "epoch": 0.47947031638819765, + "grad_norm": 2.109375, + "learning_rate": 1.0409333333333334e-05, + "loss": 0.4022, + "step": 21580 + }, + { + "epoch": 0.4796924991112691, + "grad_norm": 2.234375, + "learning_rate": 1.0404888888888889e-05, + "loss": 0.3794, + "step": 21590 + }, + { + "epoch": 0.47991468183434055, + "grad_norm": 2.359375, + "learning_rate": 1.0400444444444447e-05, + "loss": 0.3702, + "step": 21600 + }, + { + "epoch": 0.480136864557412, + "grad_norm": 2.46875, + "learning_rate": 1.0396000000000002e-05, + "loss": 0.435, + "step": 21610 + }, + { + "epoch": 0.48035904728048345, + "grad_norm": 2.4375, + "learning_rate": 1.0391555555555557e-05, + "loss": 0.389, + "step": 21620 + }, + { + "epoch": 0.4805812300035549, + "grad_norm": 2.046875, + "learning_rate": 1.0387111111111111e-05, + "loss": 0.3863, + "step": 21630 + }, + { + "epoch": 0.48080341272662636, + "grad_norm": 2.4375, + "learning_rate": 1.0382666666666668e-05, + "loss": 0.3822, + "step": 21640 + }, + { + "epoch": 0.4810255954496978, + "grad_norm": 1.9453125, + "learning_rate": 1.0378222222222223e-05, + "loss": 0.4093, + "step": 21650 + }, + { + "epoch": 0.48124777817276926, + "grad_norm": 2.21875, + "learning_rate": 1.0373777777777778e-05, + "loss": 0.3749, + "step": 21660 + }, + { + "epoch": 0.48146996089584077, + "grad_norm": 3.078125, + "learning_rate": 1.0369333333333334e-05, + "loss": 0.4006, + "step": 21670 + }, + { + "epoch": 0.4816921436189122, + "grad_norm": 2.375, + "learning_rate": 1.0364888888888889e-05, + "loss": 0.4306, + "step": 21680 + }, + { + "epoch": 0.48191432634198367, + "grad_norm": 2.5, + "learning_rate": 1.0360444444444446e-05, + "loss": 0.38, + "step": 21690 + }, + { + "epoch": 0.4821365090650551, + "grad_norm": 2.421875, + "learning_rate": 1.0356000000000002e-05, + "loss": 0.414, + "step": 21700 + }, + { + "epoch": 0.48235869178812657, + "grad_norm": 2.296875, + "learning_rate": 1.0351555555555557e-05, + "loss": 0.403, + "step": 21710 + }, + { + "epoch": 0.482580874511198, + "grad_norm": 2.25, + "learning_rate": 1.0347111111111112e-05, + "loss": 0.3978, + "step": 21720 + }, + { + "epoch": 0.4828030572342695, + "grad_norm": 2.46875, + "learning_rate": 1.0342666666666667e-05, + "loss": 0.393, + "step": 21730 + }, + { + "epoch": 0.4830252399573409, + "grad_norm": 2.25, + "learning_rate": 1.0338222222222223e-05, + "loss": 0.3547, + "step": 21740 + }, + { + "epoch": 0.4832474226804124, + "grad_norm": 2.296875, + "learning_rate": 1.0333777777777778e-05, + "loss": 0.4171, + "step": 21750 + }, + { + "epoch": 0.4834696054034838, + "grad_norm": 2.171875, + "learning_rate": 1.0329333333333333e-05, + "loss": 0.417, + "step": 21760 + }, + { + "epoch": 0.4836917881265553, + "grad_norm": 2.46875, + "learning_rate": 1.032488888888889e-05, + "loss": 0.4144, + "step": 21770 + }, + { + "epoch": 0.48391397084962673, + "grad_norm": 2.5625, + "learning_rate": 1.0320444444444446e-05, + "loss": 0.3907, + "step": 21780 + }, + { + "epoch": 0.4841361535726982, + "grad_norm": 2.328125, + "learning_rate": 1.0316e-05, + "loss": 0.3828, + "step": 21790 + }, + { + "epoch": 0.48435833629576963, + "grad_norm": 2.03125, + "learning_rate": 1.0311555555555557e-05, + "loss": 0.3694, + "step": 21800 + }, + { + "epoch": 0.4845805190188411, + "grad_norm": 1.9765625, + "learning_rate": 1.0307111111111112e-05, + "loss": 0.3973, + "step": 21810 + }, + { + "epoch": 0.48480270174191253, + "grad_norm": 2.078125, + "learning_rate": 1.0302666666666667e-05, + "loss": 0.3438, + "step": 21820 + }, + { + "epoch": 0.485024884464984, + "grad_norm": 2.21875, + "learning_rate": 1.0298222222222224e-05, + "loss": 0.3919, + "step": 21830 + }, + { + "epoch": 0.48524706718805544, + "grad_norm": 2.28125, + "learning_rate": 1.0293777777777778e-05, + "loss": 0.3648, + "step": 21840 + }, + { + "epoch": 0.4854692499111269, + "grad_norm": 2.421875, + "learning_rate": 1.0289333333333333e-05, + "loss": 0.3749, + "step": 21850 + }, + { + "epoch": 0.48569143263419834, + "grad_norm": 2.890625, + "learning_rate": 1.0284888888888888e-05, + "loss": 0.4041, + "step": 21860 + }, + { + "epoch": 0.48591361535726985, + "grad_norm": 2.484375, + "learning_rate": 1.0280444444444446e-05, + "loss": 0.4086, + "step": 21870 + }, + { + "epoch": 0.4861357980803413, + "grad_norm": 2.34375, + "learning_rate": 1.0276000000000001e-05, + "loss": 0.3983, + "step": 21880 + }, + { + "epoch": 0.48635798080341275, + "grad_norm": 1.9765625, + "learning_rate": 1.0271555555555556e-05, + "loss": 0.4025, + "step": 21890 + }, + { + "epoch": 0.4865801635264842, + "grad_norm": 2.203125, + "learning_rate": 1.0267111111111113e-05, + "loss": 0.3899, + "step": 21900 + }, + { + "epoch": 0.48680234624955565, + "grad_norm": 2.421875, + "learning_rate": 1.0262666666666667e-05, + "loss": 0.3517, + "step": 21910 + }, + { + "epoch": 0.4870245289726271, + "grad_norm": 3.0625, + "learning_rate": 1.0258222222222222e-05, + "loss": 0.3603, + "step": 21920 + }, + { + "epoch": 0.48724671169569855, + "grad_norm": 2.078125, + "learning_rate": 1.0253777777777779e-05, + "loss": 0.3897, + "step": 21930 + }, + { + "epoch": 0.48746889441877, + "grad_norm": 2.328125, + "learning_rate": 1.0249333333333334e-05, + "loss": 0.3824, + "step": 21940 + }, + { + "epoch": 0.48769107714184146, + "grad_norm": 2.15625, + "learning_rate": 1.0244888888888889e-05, + "loss": 0.4117, + "step": 21950 + }, + { + "epoch": 0.4879132598649129, + "grad_norm": 2.3125, + "learning_rate": 1.0240444444444447e-05, + "loss": 0.3726, + "step": 21960 + }, + { + "epoch": 0.48813544258798436, + "grad_norm": 2.46875, + "learning_rate": 1.0236000000000002e-05, + "loss": 0.3954, + "step": 21970 + }, + { + "epoch": 0.4883576253110558, + "grad_norm": 2.21875, + "learning_rate": 1.0231555555555556e-05, + "loss": 0.397, + "step": 21980 + }, + { + "epoch": 0.48857980803412726, + "grad_norm": 2.453125, + "learning_rate": 1.0227111111111111e-05, + "loss": 0.4118, + "step": 21990 + }, + { + "epoch": 0.4888019907571987, + "grad_norm": 2.375, + "learning_rate": 1.0222666666666668e-05, + "loss": 0.4002, + "step": 22000 + }, + { + "epoch": 0.48902417348027016, + "grad_norm": 2.859375, + "learning_rate": 1.0218222222222223e-05, + "loss": 0.409, + "step": 22010 + }, + { + "epoch": 0.4892463562033416, + "grad_norm": 2.59375, + "learning_rate": 1.0213777777777778e-05, + "loss": 0.3543, + "step": 22020 + }, + { + "epoch": 0.48946853892641307, + "grad_norm": 2.109375, + "learning_rate": 1.0209333333333334e-05, + "loss": 0.4016, + "step": 22030 + }, + { + "epoch": 0.4896907216494845, + "grad_norm": 3.03125, + "learning_rate": 1.0204888888888889e-05, + "loss": 0.3772, + "step": 22040 + }, + { + "epoch": 0.48991290437255597, + "grad_norm": 2.375, + "learning_rate": 1.0200444444444445e-05, + "loss": 0.3841, + "step": 22050 + }, + { + "epoch": 0.4901350870956274, + "grad_norm": 2.34375, + "learning_rate": 1.0196000000000002e-05, + "loss": 0.3945, + "step": 22060 + }, + { + "epoch": 0.49035726981869887, + "grad_norm": 2.578125, + "learning_rate": 1.0191555555555557e-05, + "loss": 0.357, + "step": 22070 + }, + { + "epoch": 0.4905794525417704, + "grad_norm": 2.625, + "learning_rate": 1.0187111111111112e-05, + "loss": 0.4068, + "step": 22080 + }, + { + "epoch": 0.49080163526484183, + "grad_norm": 2.65625, + "learning_rate": 1.0182666666666667e-05, + "loss": 0.4001, + "step": 22090 + }, + { + "epoch": 0.4910238179879133, + "grad_norm": 2.46875, + "learning_rate": 1.0178222222222223e-05, + "loss": 0.3655, + "step": 22100 + }, + { + "epoch": 0.49124600071098473, + "grad_norm": 2.375, + "learning_rate": 1.0173777777777778e-05, + "loss": 0.3908, + "step": 22110 + }, + { + "epoch": 0.4914681834340562, + "grad_norm": 2.625, + "learning_rate": 1.0169333333333333e-05, + "loss": 0.4094, + "step": 22120 + }, + { + "epoch": 0.49169036615712763, + "grad_norm": 2.109375, + "learning_rate": 1.0164888888888891e-05, + "loss": 0.3689, + "step": 22130 + }, + { + "epoch": 0.4919125488801991, + "grad_norm": 2.046875, + "learning_rate": 1.0160444444444446e-05, + "loss": 0.382, + "step": 22140 + }, + { + "epoch": 0.49213473160327054, + "grad_norm": 2.515625, + "learning_rate": 1.0156e-05, + "loss": 0.3878, + "step": 22150 + }, + { + "epoch": 0.492356914326342, + "grad_norm": 2.328125, + "learning_rate": 1.0151555555555557e-05, + "loss": 0.4285, + "step": 22160 + }, + { + "epoch": 0.49257909704941344, + "grad_norm": 3.03125, + "learning_rate": 1.0147111111111112e-05, + "loss": 0.356, + "step": 22170 + }, + { + "epoch": 0.4928012797724849, + "grad_norm": 2.25, + "learning_rate": 1.0142666666666667e-05, + "loss": 0.3692, + "step": 22180 + }, + { + "epoch": 0.49302346249555634, + "grad_norm": 2.0625, + "learning_rate": 1.0138222222222223e-05, + "loss": 0.3603, + "step": 22190 + }, + { + "epoch": 0.4932456452186278, + "grad_norm": 2.125, + "learning_rate": 1.0133777777777778e-05, + "loss": 0.409, + "step": 22200 + }, + { + "epoch": 0.49346782794169924, + "grad_norm": 2.25, + "learning_rate": 1.0129333333333333e-05, + "loss": 0.3898, + "step": 22210 + }, + { + "epoch": 0.4936900106647707, + "grad_norm": 2.671875, + "learning_rate": 1.0124888888888891e-05, + "loss": 0.4, + "step": 22220 + }, + { + "epoch": 0.49391219338784215, + "grad_norm": 1.8984375, + "learning_rate": 1.0120444444444446e-05, + "loss": 0.3752, + "step": 22230 + }, + { + "epoch": 0.4941343761109136, + "grad_norm": 2.625, + "learning_rate": 1.0116000000000001e-05, + "loss": 0.4245, + "step": 22240 + }, + { + "epoch": 0.49435655883398505, + "grad_norm": 2.890625, + "learning_rate": 1.0111555555555556e-05, + "loss": 0.4112, + "step": 22250 + }, + { + "epoch": 0.4945787415570565, + "grad_norm": 2.171875, + "learning_rate": 1.0107111111111112e-05, + "loss": 0.3718, + "step": 22260 + }, + { + "epoch": 0.49480092428012795, + "grad_norm": 2.265625, + "learning_rate": 1.0102666666666667e-05, + "loss": 0.3746, + "step": 22270 + }, + { + "epoch": 0.49502310700319946, + "grad_norm": 1.7890625, + "learning_rate": 1.0098222222222222e-05, + "loss": 0.3785, + "step": 22280 + }, + { + "epoch": 0.4952452897262709, + "grad_norm": 2.0, + "learning_rate": 1.0093777777777779e-05, + "loss": 0.3779, + "step": 22290 + }, + { + "epoch": 0.49546747244934236, + "grad_norm": 2.421875, + "learning_rate": 1.0089333333333334e-05, + "loss": 0.3781, + "step": 22300 + }, + { + "epoch": 0.4956896551724138, + "grad_norm": 2.3125, + "learning_rate": 1.008488888888889e-05, + "loss": 0.3549, + "step": 22310 + }, + { + "epoch": 0.49591183789548526, + "grad_norm": 3.125, + "learning_rate": 1.0080444444444447e-05, + "loss": 0.417, + "step": 22320 + }, + { + "epoch": 0.4961340206185567, + "grad_norm": 2.109375, + "learning_rate": 1.0076000000000001e-05, + "loss": 0.3762, + "step": 22330 + }, + { + "epoch": 0.49635620334162817, + "grad_norm": 2.109375, + "learning_rate": 1.0071555555555556e-05, + "loss": 0.4215, + "step": 22340 + }, + { + "epoch": 0.4965783860646996, + "grad_norm": 2.484375, + "learning_rate": 1.0067111111111111e-05, + "loss": 0.4219, + "step": 22350 + }, + { + "epoch": 0.49680056878777107, + "grad_norm": 2.734375, + "learning_rate": 1.0062666666666668e-05, + "loss": 0.3871, + "step": 22360 + }, + { + "epoch": 0.4970227515108425, + "grad_norm": 2.0, + "learning_rate": 1.0058222222222223e-05, + "loss": 0.3559, + "step": 22370 + }, + { + "epoch": 0.49724493423391397, + "grad_norm": 2.40625, + "learning_rate": 1.0053777777777777e-05, + "loss": 0.3805, + "step": 22380 + }, + { + "epoch": 0.4974671169569854, + "grad_norm": 1.9453125, + "learning_rate": 1.0049333333333334e-05, + "loss": 0.4216, + "step": 22390 + }, + { + "epoch": 0.4976892996800569, + "grad_norm": 1.9296875, + "learning_rate": 1.004488888888889e-05, + "loss": 0.3915, + "step": 22400 + }, + { + "epoch": 0.4979114824031283, + "grad_norm": 2.1875, + "learning_rate": 1.0040444444444445e-05, + "loss": 0.3722, + "step": 22410 + }, + { + "epoch": 0.4981336651261998, + "grad_norm": 2.453125, + "learning_rate": 1.0036000000000002e-05, + "loss": 0.4061, + "step": 22420 + }, + { + "epoch": 0.4983558478492712, + "grad_norm": 2.046875, + "learning_rate": 1.0031555555555557e-05, + "loss": 0.4162, + "step": 22430 + }, + { + "epoch": 0.4985780305723427, + "grad_norm": 2.265625, + "learning_rate": 1.0027111111111112e-05, + "loss": 0.4154, + "step": 22440 + }, + { + "epoch": 0.49880021329541413, + "grad_norm": 2.625, + "learning_rate": 1.0022666666666666e-05, + "loss": 0.3969, + "step": 22450 + }, + { + "epoch": 0.4990223960184856, + "grad_norm": 2.421875, + "learning_rate": 1.0018222222222223e-05, + "loss": 0.3987, + "step": 22460 + }, + { + "epoch": 0.49924457874155703, + "grad_norm": 2.96875, + "learning_rate": 1.0013777777777778e-05, + "loss": 0.387, + "step": 22470 + }, + { + "epoch": 0.4994667614646285, + "grad_norm": 2.875, + "learning_rate": 1.0009333333333333e-05, + "loss": 0.3965, + "step": 22480 + }, + { + "epoch": 0.4996889441877, + "grad_norm": 2.234375, + "learning_rate": 1.000488888888889e-05, + "loss": 0.4201, + "step": 22490 + }, + { + "epoch": 0.49991112691077144, + "grad_norm": 2.109375, + "learning_rate": 1.0000444444444446e-05, + "loss": 0.407, + "step": 22500 + }, + { + "epoch": 0.5001333096338428, + "grad_norm": 2.546875, + "learning_rate": 9.996e-06, + "loss": 0.4298, + "step": 22510 + }, + { + "epoch": 0.5003554923569143, + "grad_norm": 2.640625, + "learning_rate": 9.991555555555557e-06, + "loss": 0.3779, + "step": 22520 + }, + { + "epoch": 0.5005776750799857, + "grad_norm": 2.515625, + "learning_rate": 9.987111111111112e-06, + "loss": 0.3997, + "step": 22530 + }, + { + "epoch": 0.5007998578030572, + "grad_norm": 2.421875, + "learning_rate": 9.982666666666667e-06, + "loss": 0.3894, + "step": 22540 + }, + { + "epoch": 0.5010220405261286, + "grad_norm": 2.40625, + "learning_rate": 9.978222222222223e-06, + "loss": 0.4136, + "step": 22550 + }, + { + "epoch": 0.5012442232492001, + "grad_norm": 2.171875, + "learning_rate": 9.973777777777778e-06, + "loss": 0.3812, + "step": 22560 + }, + { + "epoch": 0.5014664059722715, + "grad_norm": 2.15625, + "learning_rate": 9.969333333333335e-06, + "loss": 0.4132, + "step": 22570 + }, + { + "epoch": 0.501688588695343, + "grad_norm": 2.34375, + "learning_rate": 9.96488888888889e-06, + "loss": 0.3851, + "step": 22580 + }, + { + "epoch": 0.5019107714184146, + "grad_norm": 2.125, + "learning_rate": 9.960444444444444e-06, + "loss": 0.3617, + "step": 22590 + }, + { + "epoch": 0.502132954141486, + "grad_norm": 2.546875, + "learning_rate": 9.956000000000001e-06, + "loss": 0.3363, + "step": 22600 + }, + { + "epoch": 0.5023551368645575, + "grad_norm": 2.40625, + "learning_rate": 9.951555555555556e-06, + "loss": 0.3396, + "step": 22610 + }, + { + "epoch": 0.5025773195876289, + "grad_norm": 2.359375, + "learning_rate": 9.947111111111112e-06, + "loss": 0.3656, + "step": 22620 + }, + { + "epoch": 0.5027995023107004, + "grad_norm": 2.0, + "learning_rate": 9.942666666666667e-06, + "loss": 0.4024, + "step": 22630 + }, + { + "epoch": 0.5030216850337718, + "grad_norm": 1.8828125, + "learning_rate": 9.938222222222224e-06, + "loss": 0.4049, + "step": 22640 + }, + { + "epoch": 0.5032438677568433, + "grad_norm": 2.28125, + "learning_rate": 9.933777777777779e-06, + "loss": 0.4076, + "step": 22650 + }, + { + "epoch": 0.5034660504799147, + "grad_norm": 2.5, + "learning_rate": 9.929333333333333e-06, + "loss": 0.3608, + "step": 22660 + }, + { + "epoch": 0.5036882332029862, + "grad_norm": 2.484375, + "learning_rate": 9.92488888888889e-06, + "loss": 0.4079, + "step": 22670 + }, + { + "epoch": 0.5039104159260576, + "grad_norm": 2.40625, + "learning_rate": 9.920444444444445e-06, + "loss": 0.4066, + "step": 22680 + }, + { + "epoch": 0.5041325986491291, + "grad_norm": 2.1875, + "learning_rate": 9.916000000000001e-06, + "loss": 0.4185, + "step": 22690 + }, + { + "epoch": 0.5043547813722005, + "grad_norm": 2.21875, + "learning_rate": 9.911555555555556e-06, + "loss": 0.3507, + "step": 22700 + }, + { + "epoch": 0.504576964095272, + "grad_norm": 2.125, + "learning_rate": 9.907111111111111e-06, + "loss": 0.3864, + "step": 22710 + }, + { + "epoch": 0.5047991468183434, + "grad_norm": 2.484375, + "learning_rate": 9.902666666666668e-06, + "loss": 0.4213, + "step": 22720 + }, + { + "epoch": 0.5050213295414149, + "grad_norm": 2.328125, + "learning_rate": 9.898222222222224e-06, + "loss": 0.3925, + "step": 22730 + }, + { + "epoch": 0.5052435122644863, + "grad_norm": 2.546875, + "learning_rate": 9.893777777777779e-06, + "loss": 0.4311, + "step": 22740 + }, + { + "epoch": 0.5054656949875578, + "grad_norm": 2.328125, + "learning_rate": 9.889333333333334e-06, + "loss": 0.3781, + "step": 22750 + }, + { + "epoch": 0.5056878777106292, + "grad_norm": 2.25, + "learning_rate": 9.884888888888889e-06, + "loss": 0.3717, + "step": 22760 + }, + { + "epoch": 0.5059100604337007, + "grad_norm": 2.109375, + "learning_rate": 9.880444444444445e-06, + "loss": 0.3664, + "step": 22770 + }, + { + "epoch": 0.5061322431567721, + "grad_norm": 2.203125, + "learning_rate": 9.876000000000002e-06, + "loss": 0.3805, + "step": 22780 + }, + { + "epoch": 0.5063544258798436, + "grad_norm": 2.75, + "learning_rate": 9.871555555555557e-06, + "loss": 0.4212, + "step": 22790 + }, + { + "epoch": 0.5065766086029151, + "grad_norm": 2.578125, + "learning_rate": 9.867111111111111e-06, + "loss": 0.4189, + "step": 22800 + }, + { + "epoch": 0.5067987913259865, + "grad_norm": 2.0, + "learning_rate": 9.862666666666668e-06, + "loss": 0.3924, + "step": 22810 + }, + { + "epoch": 0.507020974049058, + "grad_norm": 2.390625, + "learning_rate": 9.858222222222223e-06, + "loss": 0.3742, + "step": 22820 + }, + { + "epoch": 0.5072431567721294, + "grad_norm": 2.546875, + "learning_rate": 9.85377777777778e-06, + "loss": 0.3877, + "step": 22830 + }, + { + "epoch": 0.5074653394952009, + "grad_norm": 2.3125, + "learning_rate": 9.849333333333334e-06, + "loss": 0.4126, + "step": 22840 + }, + { + "epoch": 0.5076875222182723, + "grad_norm": 2.578125, + "learning_rate": 9.844888888888889e-06, + "loss": 0.3659, + "step": 22850 + }, + { + "epoch": 0.5079097049413438, + "grad_norm": 2.359375, + "learning_rate": 9.840444444444446e-06, + "loss": 0.4144, + "step": 22860 + }, + { + "epoch": 0.5081318876644152, + "grad_norm": 2.234375, + "learning_rate": 9.836e-06, + "loss": 0.3972, + "step": 22870 + }, + { + "epoch": 0.5083540703874867, + "grad_norm": 2.515625, + "learning_rate": 9.831555555555557e-06, + "loss": 0.364, + "step": 22880 + }, + { + "epoch": 0.5085762531105581, + "grad_norm": 2.96875, + "learning_rate": 9.827111111111112e-06, + "loss": 0.4129, + "step": 22890 + }, + { + "epoch": 0.5087984358336296, + "grad_norm": 2.484375, + "learning_rate": 9.822666666666667e-06, + "loss": 0.4082, + "step": 22900 + }, + { + "epoch": 0.509020618556701, + "grad_norm": 3.046875, + "learning_rate": 9.818222222222223e-06, + "loss": 0.4159, + "step": 22910 + }, + { + "epoch": 0.5092428012797725, + "grad_norm": 2.421875, + "learning_rate": 9.813777777777778e-06, + "loss": 0.3992, + "step": 22920 + }, + { + "epoch": 0.5094649840028439, + "grad_norm": 2.75, + "learning_rate": 9.809333333333335e-06, + "loss": 0.3495, + "step": 22930 + }, + { + "epoch": 0.5096871667259154, + "grad_norm": 2.390625, + "learning_rate": 9.80488888888889e-06, + "loss": 0.376, + "step": 22940 + }, + { + "epoch": 0.5099093494489868, + "grad_norm": 2.234375, + "learning_rate": 9.800444444444446e-06, + "loss": 0.3861, + "step": 22950 + }, + { + "epoch": 0.5101315321720583, + "grad_norm": 2.359375, + "learning_rate": 9.796e-06, + "loss": 0.4001, + "step": 22960 + }, + { + "epoch": 0.5103537148951297, + "grad_norm": 1.984375, + "learning_rate": 9.791555555555556e-06, + "loss": 0.4003, + "step": 22970 + }, + { + "epoch": 0.5105758976182012, + "grad_norm": 2.59375, + "learning_rate": 9.787111111111112e-06, + "loss": 0.3659, + "step": 22980 + }, + { + "epoch": 0.5107980803412726, + "grad_norm": 2.09375, + "learning_rate": 9.782666666666667e-06, + "loss": 0.3836, + "step": 22990 + }, + { + "epoch": 0.5110202630643441, + "grad_norm": 2.234375, + "learning_rate": 9.778222222222224e-06, + "loss": 0.3947, + "step": 23000 + }, + { + "epoch": 0.5112424457874156, + "grad_norm": 2.46875, + "learning_rate": 9.773777777777778e-06, + "loss": 0.3722, + "step": 23010 + }, + { + "epoch": 0.511464628510487, + "grad_norm": 2.3125, + "learning_rate": 9.769333333333333e-06, + "loss": 0.384, + "step": 23020 + }, + { + "epoch": 0.5116868112335585, + "grad_norm": 2.375, + "learning_rate": 9.76488888888889e-06, + "loss": 0.3549, + "step": 23030 + }, + { + "epoch": 0.5119089939566299, + "grad_norm": 2.140625, + "learning_rate": 9.760444444444446e-06, + "loss": 0.395, + "step": 23040 + }, + { + "epoch": 0.5121311766797014, + "grad_norm": 2.1875, + "learning_rate": 9.756000000000001e-06, + "loss": 0.34, + "step": 23050 + }, + { + "epoch": 0.5123533594027728, + "grad_norm": 2.15625, + "learning_rate": 9.751555555555556e-06, + "loss": 0.3952, + "step": 23060 + }, + { + "epoch": 0.5125755421258443, + "grad_norm": 2.359375, + "learning_rate": 9.74711111111111e-06, + "loss": 0.4082, + "step": 23070 + }, + { + "epoch": 0.5127977248489157, + "grad_norm": 2.359375, + "learning_rate": 9.742666666666667e-06, + "loss": 0.3826, + "step": 23080 + }, + { + "epoch": 0.5130199075719872, + "grad_norm": 2.421875, + "learning_rate": 9.738222222222224e-06, + "loss": 0.3905, + "step": 23090 + }, + { + "epoch": 0.5132420902950586, + "grad_norm": 1.9609375, + "learning_rate": 9.733777777777779e-06, + "loss": 0.3996, + "step": 23100 + }, + { + "epoch": 0.5134642730181301, + "grad_norm": 2.359375, + "learning_rate": 9.729333333333334e-06, + "loss": 0.3944, + "step": 23110 + }, + { + "epoch": 0.5136864557412015, + "grad_norm": 2.21875, + "learning_rate": 9.724888888888888e-06, + "loss": 0.3675, + "step": 23120 + }, + { + "epoch": 0.513908638464273, + "grad_norm": 2.328125, + "learning_rate": 9.720444444444445e-06, + "loss": 0.4111, + "step": 23130 + }, + { + "epoch": 0.5141308211873444, + "grad_norm": 2.59375, + "learning_rate": 9.716000000000002e-06, + "loss": 0.4227, + "step": 23140 + }, + { + "epoch": 0.5143530039104159, + "grad_norm": 2.265625, + "learning_rate": 9.711555555555556e-06, + "loss": 0.396, + "step": 23150 + }, + { + "epoch": 0.5145751866334873, + "grad_norm": 2.625, + "learning_rate": 9.707111111111111e-06, + "loss": 0.3875, + "step": 23160 + }, + { + "epoch": 0.5147973693565588, + "grad_norm": 2.5625, + "learning_rate": 9.702666666666668e-06, + "loss": 0.387, + "step": 23170 + }, + { + "epoch": 0.5150195520796302, + "grad_norm": 2.484375, + "learning_rate": 9.698222222222223e-06, + "loss": 0.3835, + "step": 23180 + }, + { + "epoch": 0.5152417348027017, + "grad_norm": 2.5, + "learning_rate": 9.693777777777779e-06, + "loss": 0.3879, + "step": 23190 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 2.640625, + "learning_rate": 9.689333333333334e-06, + "loss": 0.3927, + "step": 23200 + }, + { + "epoch": 0.5156861002488446, + "grad_norm": 2.625, + "learning_rate": 9.684888888888889e-06, + "loss": 0.3956, + "step": 23210 + }, + { + "epoch": 0.5159082829719162, + "grad_norm": 2.46875, + "learning_rate": 9.680444444444445e-06, + "loss": 0.3682, + "step": 23220 + }, + { + "epoch": 0.5161304656949876, + "grad_norm": 2.5, + "learning_rate": 9.676e-06, + "loss": 0.4165, + "step": 23230 + }, + { + "epoch": 0.5163526484180591, + "grad_norm": 2.453125, + "learning_rate": 9.671555555555557e-06, + "loss": 0.3805, + "step": 23240 + }, + { + "epoch": 0.5165748311411305, + "grad_norm": 2.203125, + "learning_rate": 9.667111111111112e-06, + "loss": 0.3986, + "step": 23250 + }, + { + "epoch": 0.516797013864202, + "grad_norm": 2.390625, + "learning_rate": 9.662666666666668e-06, + "loss": 0.3789, + "step": 23260 + }, + { + "epoch": 0.5170191965872734, + "grad_norm": 1.9921875, + "learning_rate": 9.658222222222223e-06, + "loss": 0.4063, + "step": 23270 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 2.890625, + "learning_rate": 9.653777777777778e-06, + "loss": 0.4068, + "step": 23280 + }, + { + "epoch": 0.5174635620334163, + "grad_norm": 2.140625, + "learning_rate": 9.649333333333334e-06, + "loss": 0.3881, + "step": 23290 + }, + { + "epoch": 0.5176857447564878, + "grad_norm": 2.234375, + "learning_rate": 9.64488888888889e-06, + "loss": 0.403, + "step": 23300 + }, + { + "epoch": 0.5179079274795592, + "grad_norm": 2.3125, + "learning_rate": 9.640444444444446e-06, + "loss": 0.411, + "step": 23310 + }, + { + "epoch": 0.5181301102026307, + "grad_norm": 2.71875, + "learning_rate": 9.636e-06, + "loss": 0.411, + "step": 23320 + }, + { + "epoch": 0.5183522929257021, + "grad_norm": 2.875, + "learning_rate": 9.631555555555555e-06, + "loss": 0.3763, + "step": 23330 + }, + { + "epoch": 0.5185744756487736, + "grad_norm": 2.15625, + "learning_rate": 9.627111111111112e-06, + "loss": 0.4007, + "step": 23340 + }, + { + "epoch": 0.518796658371845, + "grad_norm": 2.90625, + "learning_rate": 9.622666666666668e-06, + "loss": 0.3983, + "step": 23350 + }, + { + "epoch": 0.5190188410949165, + "grad_norm": 2.171875, + "learning_rate": 9.618222222222223e-06, + "loss": 0.4572, + "step": 23360 + }, + { + "epoch": 0.5192410238179879, + "grad_norm": 2.265625, + "learning_rate": 9.613777777777778e-06, + "loss": 0.3783, + "step": 23370 + }, + { + "epoch": 0.5194632065410594, + "grad_norm": 2.109375, + "learning_rate": 9.609333333333333e-06, + "loss": 0.3842, + "step": 23380 + }, + { + "epoch": 0.5196853892641308, + "grad_norm": 2.5, + "learning_rate": 9.60488888888889e-06, + "loss": 0.3901, + "step": 23390 + }, + { + "epoch": 0.5199075719872023, + "grad_norm": 2.5, + "learning_rate": 9.600444444444446e-06, + "loss": 0.4148, + "step": 23400 + }, + { + "epoch": 0.5201297547102738, + "grad_norm": 2.609375, + "learning_rate": 9.596000000000001e-06, + "loss": 0.3899, + "step": 23410 + }, + { + "epoch": 0.5203519374333452, + "grad_norm": 2.65625, + "learning_rate": 9.591555555555556e-06, + "loss": 0.4579, + "step": 23420 + }, + { + "epoch": 0.5205741201564167, + "grad_norm": 2.75, + "learning_rate": 9.58711111111111e-06, + "loss": 0.3828, + "step": 23430 + }, + { + "epoch": 0.5207963028794881, + "grad_norm": 1.9296875, + "learning_rate": 9.582666666666667e-06, + "loss": 0.3632, + "step": 23440 + }, + { + "epoch": 0.5210184856025596, + "grad_norm": 2.046875, + "learning_rate": 9.578222222222224e-06, + "loss": 0.4083, + "step": 23450 + }, + { + "epoch": 0.521240668325631, + "grad_norm": 2.3125, + "learning_rate": 9.573777777777779e-06, + "loss": 0.37, + "step": 23460 + }, + { + "epoch": 0.5214628510487025, + "grad_norm": 1.78125, + "learning_rate": 9.569333333333333e-06, + "loss": 0.3641, + "step": 23470 + }, + { + "epoch": 0.5216850337717739, + "grad_norm": 1.9140625, + "learning_rate": 9.56488888888889e-06, + "loss": 0.3725, + "step": 23480 + }, + { + "epoch": 0.5219072164948454, + "grad_norm": 2.296875, + "learning_rate": 9.560444444444445e-06, + "loss": 0.3971, + "step": 23490 + }, + { + "epoch": 0.5221293992179168, + "grad_norm": 2.453125, + "learning_rate": 9.556000000000001e-06, + "loss": 0.3923, + "step": 23500 + }, + { + "epoch": 0.5223515819409883, + "grad_norm": 2.453125, + "learning_rate": 9.551555555555556e-06, + "loss": 0.3926, + "step": 23510 + }, + { + "epoch": 0.5225737646640597, + "grad_norm": 2.546875, + "learning_rate": 9.547111111111111e-06, + "loss": 0.3701, + "step": 23520 + }, + { + "epoch": 0.5227959473871312, + "grad_norm": 2.5625, + "learning_rate": 9.542666666666668e-06, + "loss": 0.3761, + "step": 23530 + }, + { + "epoch": 0.5230181301102026, + "grad_norm": 1.9765625, + "learning_rate": 9.538222222222222e-06, + "loss": 0.3749, + "step": 23540 + }, + { + "epoch": 0.5232403128332741, + "grad_norm": 2.0, + "learning_rate": 9.533777777777779e-06, + "loss": 0.401, + "step": 23550 + }, + { + "epoch": 0.5234624955563455, + "grad_norm": 3.125, + "learning_rate": 9.529333333333334e-06, + "loss": 0.3932, + "step": 23560 + }, + { + "epoch": 0.523684678279417, + "grad_norm": 2.5, + "learning_rate": 9.52488888888889e-06, + "loss": 0.409, + "step": 23570 + }, + { + "epoch": 0.5239068610024884, + "grad_norm": 2.359375, + "learning_rate": 9.520444444444445e-06, + "loss": 0.3663, + "step": 23580 + }, + { + "epoch": 0.5241290437255599, + "grad_norm": 2.59375, + "learning_rate": 9.516e-06, + "loss": 0.4164, + "step": 23590 + }, + { + "epoch": 0.5243512264486313, + "grad_norm": 2.234375, + "learning_rate": 9.511555555555557e-06, + "loss": 0.3705, + "step": 23600 + }, + { + "epoch": 0.5245734091717028, + "grad_norm": 1.8828125, + "learning_rate": 9.507111111111111e-06, + "loss": 0.3507, + "step": 23610 + }, + { + "epoch": 0.5247955918947743, + "grad_norm": 2.53125, + "learning_rate": 9.502666666666668e-06, + "loss": 0.4186, + "step": 23620 + }, + { + "epoch": 0.5250177746178457, + "grad_norm": 2.703125, + "learning_rate": 9.498222222222223e-06, + "loss": 0.4051, + "step": 23630 + }, + { + "epoch": 0.5252399573409172, + "grad_norm": 2.40625, + "learning_rate": 9.493777777777778e-06, + "loss": 0.4121, + "step": 23640 + }, + { + "epoch": 0.5254621400639886, + "grad_norm": 2.4375, + "learning_rate": 9.489333333333334e-06, + "loss": 0.372, + "step": 23650 + }, + { + "epoch": 0.5256843227870601, + "grad_norm": 2.875, + "learning_rate": 9.48488888888889e-06, + "loss": 0.4198, + "step": 23660 + }, + { + "epoch": 0.5259065055101315, + "grad_norm": 2.5, + "learning_rate": 9.480444444444446e-06, + "loss": 0.368, + "step": 23670 + }, + { + "epoch": 0.526128688233203, + "grad_norm": 2.625, + "learning_rate": 9.476e-06, + "loss": 0.3731, + "step": 23680 + }, + { + "epoch": 0.5263508709562744, + "grad_norm": 2.328125, + "learning_rate": 9.471555555555555e-06, + "loss": 0.3682, + "step": 23690 + }, + { + "epoch": 0.5265730536793459, + "grad_norm": 2.96875, + "learning_rate": 9.467111111111112e-06, + "loss": 0.3844, + "step": 23700 + }, + { + "epoch": 0.5267952364024173, + "grad_norm": 2.28125, + "learning_rate": 9.462666666666668e-06, + "loss": 0.3779, + "step": 23710 + }, + { + "epoch": 0.5270174191254888, + "grad_norm": 2.453125, + "learning_rate": 9.458222222222223e-06, + "loss": 0.3784, + "step": 23720 + }, + { + "epoch": 0.5272396018485602, + "grad_norm": 2.765625, + "learning_rate": 9.453777777777778e-06, + "loss": 0.3681, + "step": 23730 + }, + { + "epoch": 0.5274617845716317, + "grad_norm": 2.203125, + "learning_rate": 9.449333333333333e-06, + "loss": 0.3966, + "step": 23740 + }, + { + "epoch": 0.5276839672947031, + "grad_norm": 2.5, + "learning_rate": 9.44488888888889e-06, + "loss": 0.36, + "step": 23750 + }, + { + "epoch": 0.5279061500177746, + "grad_norm": 2.171875, + "learning_rate": 9.440444444444446e-06, + "loss": 0.3675, + "step": 23760 + }, + { + "epoch": 0.528128332740846, + "grad_norm": 2.375, + "learning_rate": 9.436e-06, + "loss": 0.3679, + "step": 23770 + }, + { + "epoch": 0.5283505154639175, + "grad_norm": 2.109375, + "learning_rate": 9.431555555555556e-06, + "loss": 0.3862, + "step": 23780 + }, + { + "epoch": 0.5285726981869889, + "grad_norm": 2.453125, + "learning_rate": 9.427111111111112e-06, + "loss": 0.365, + "step": 23790 + }, + { + "epoch": 0.5287948809100604, + "grad_norm": 2.21875, + "learning_rate": 9.422666666666667e-06, + "loss": 0.3974, + "step": 23800 + }, + { + "epoch": 0.5290170636331318, + "grad_norm": 2.1875, + "learning_rate": 9.418222222222224e-06, + "loss": 0.35, + "step": 23810 + }, + { + "epoch": 0.5292392463562033, + "grad_norm": 2.84375, + "learning_rate": 9.413777777777778e-06, + "loss": 0.423, + "step": 23820 + }, + { + "epoch": 0.5294614290792748, + "grad_norm": 2.203125, + "learning_rate": 9.409333333333333e-06, + "loss": 0.3607, + "step": 23830 + }, + { + "epoch": 0.5296836118023462, + "grad_norm": 2.765625, + "learning_rate": 9.40488888888889e-06, + "loss": 0.3973, + "step": 23840 + }, + { + "epoch": 0.5299057945254178, + "grad_norm": 2.0625, + "learning_rate": 9.400444444444445e-06, + "loss": 0.4067, + "step": 23850 + }, + { + "epoch": 0.5301279772484891, + "grad_norm": 2.1875, + "learning_rate": 9.396000000000001e-06, + "loss": 0.4117, + "step": 23860 + }, + { + "epoch": 0.5303501599715607, + "grad_norm": 2.1875, + "learning_rate": 9.391555555555556e-06, + "loss": 0.3517, + "step": 23870 + }, + { + "epoch": 0.530572342694632, + "grad_norm": 2.375, + "learning_rate": 9.387111111111113e-06, + "loss": 0.3877, + "step": 23880 + }, + { + "epoch": 0.5307945254177036, + "grad_norm": 2.40625, + "learning_rate": 9.382666666666667e-06, + "loss": 0.4231, + "step": 23890 + }, + { + "epoch": 0.531016708140775, + "grad_norm": 2.8125, + "learning_rate": 9.378222222222222e-06, + "loss": 0.3845, + "step": 23900 + }, + { + "epoch": 0.5312388908638465, + "grad_norm": 2.078125, + "learning_rate": 9.373777777777779e-06, + "loss": 0.436, + "step": 23910 + }, + { + "epoch": 0.5314610735869179, + "grad_norm": 2.40625, + "learning_rate": 9.369333333333334e-06, + "loss": 0.3605, + "step": 23920 + }, + { + "epoch": 0.5316832563099894, + "grad_norm": 2.078125, + "learning_rate": 9.36488888888889e-06, + "loss": 0.3881, + "step": 23930 + }, + { + "epoch": 0.5319054390330608, + "grad_norm": 2.46875, + "learning_rate": 9.360444444444445e-06, + "loss": 0.4014, + "step": 23940 + }, + { + "epoch": 0.5321276217561323, + "grad_norm": 2.8125, + "learning_rate": 9.356e-06, + "loss": 0.3885, + "step": 23950 + }, + { + "epoch": 0.5323498044792037, + "grad_norm": 2.296875, + "learning_rate": 9.351555555555556e-06, + "loss": 0.3429, + "step": 23960 + }, + { + "epoch": 0.5325719872022752, + "grad_norm": 2.640625, + "learning_rate": 9.347111111111113e-06, + "loss": 0.4116, + "step": 23970 + }, + { + "epoch": 0.5327941699253466, + "grad_norm": 2.203125, + "learning_rate": 9.342666666666668e-06, + "loss": 0.4032, + "step": 23980 + }, + { + "epoch": 0.5330163526484181, + "grad_norm": 1.875, + "learning_rate": 9.338222222222223e-06, + "loss": 0.3314, + "step": 23990 + }, + { + "epoch": 0.5332385353714895, + "grad_norm": 2.40625, + "learning_rate": 9.333777777777777e-06, + "loss": 0.3691, + "step": 24000 + }, + { + "epoch": 0.533460718094561, + "grad_norm": 2.125, + "learning_rate": 9.329333333333334e-06, + "loss": 0.3932, + "step": 24010 + }, + { + "epoch": 0.5336829008176324, + "grad_norm": 2.25, + "learning_rate": 9.32488888888889e-06, + "loss": 0.3472, + "step": 24020 + }, + { + "epoch": 0.5339050835407039, + "grad_norm": 2.71875, + "learning_rate": 9.320444444444445e-06, + "loss": 0.4251, + "step": 24030 + }, + { + "epoch": 0.5341272662637754, + "grad_norm": 2.375, + "learning_rate": 9.316e-06, + "loss": 0.3884, + "step": 24040 + }, + { + "epoch": 0.5343494489868468, + "grad_norm": 1.953125, + "learning_rate": 9.311555555555555e-06, + "loss": 0.3509, + "step": 24050 + }, + { + "epoch": 0.5345716317099183, + "grad_norm": 2.140625, + "learning_rate": 9.307111111111112e-06, + "loss": 0.3587, + "step": 24060 + }, + { + "epoch": 0.5347938144329897, + "grad_norm": 2.34375, + "learning_rate": 9.302666666666668e-06, + "loss": 0.4026, + "step": 24070 + }, + { + "epoch": 0.5350159971560612, + "grad_norm": 2.390625, + "learning_rate": 9.298222222222223e-06, + "loss": 0.3798, + "step": 24080 + }, + { + "epoch": 0.5352381798791326, + "grad_norm": 2.46875, + "learning_rate": 9.293777777777778e-06, + "loss": 0.3895, + "step": 24090 + }, + { + "epoch": 0.5354603626022041, + "grad_norm": 2.09375, + "learning_rate": 9.289333333333334e-06, + "loss": 0.3804, + "step": 24100 + }, + { + "epoch": 0.5356825453252755, + "grad_norm": 2.125, + "learning_rate": 9.28488888888889e-06, + "loss": 0.3708, + "step": 24110 + }, + { + "epoch": 0.535904728048347, + "grad_norm": 2.5625, + "learning_rate": 9.280444444444446e-06, + "loss": 0.3976, + "step": 24120 + }, + { + "epoch": 0.5361269107714184, + "grad_norm": 2.484375, + "learning_rate": 9.276e-06, + "loss": 0.3709, + "step": 24130 + }, + { + "epoch": 0.5363490934944899, + "grad_norm": 2.328125, + "learning_rate": 9.271555555555555e-06, + "loss": 0.3845, + "step": 24140 + }, + { + "epoch": 0.5365712762175613, + "grad_norm": 2.21875, + "learning_rate": 9.267111111111112e-06, + "loss": 0.3443, + "step": 24150 + }, + { + "epoch": 0.5367934589406328, + "grad_norm": 1.9765625, + "learning_rate": 9.262666666666667e-06, + "loss": 0.3965, + "step": 24160 + }, + { + "epoch": 0.5370156416637042, + "grad_norm": 2.5, + "learning_rate": 9.258222222222223e-06, + "loss": 0.3619, + "step": 24170 + }, + { + "epoch": 0.5372378243867757, + "grad_norm": 2.671875, + "learning_rate": 9.253777777777778e-06, + "loss": 0.4331, + "step": 24180 + }, + { + "epoch": 0.5374600071098471, + "grad_norm": 2.953125, + "learning_rate": 9.249333333333335e-06, + "loss": 0.3977, + "step": 24190 + }, + { + "epoch": 0.5376821898329186, + "grad_norm": 2.4375, + "learning_rate": 9.24488888888889e-06, + "loss": 0.404, + "step": 24200 + }, + { + "epoch": 0.53790437255599, + "grad_norm": 2.8125, + "learning_rate": 9.240444444444444e-06, + "loss": 0.4076, + "step": 24210 + }, + { + "epoch": 0.5381265552790615, + "grad_norm": 2.5, + "learning_rate": 9.236000000000001e-06, + "loss": 0.4087, + "step": 24220 + }, + { + "epoch": 0.538348738002133, + "grad_norm": 2.84375, + "learning_rate": 9.231555555555556e-06, + "loss": 0.4065, + "step": 24230 + }, + { + "epoch": 0.5385709207252044, + "grad_norm": 2.703125, + "learning_rate": 9.227111111111112e-06, + "loss": 0.4035, + "step": 24240 + }, + { + "epoch": 0.5387931034482759, + "grad_norm": 2.234375, + "learning_rate": 9.222666666666667e-06, + "loss": 0.3972, + "step": 24250 + }, + { + "epoch": 0.5390152861713473, + "grad_norm": 2.328125, + "learning_rate": 9.218222222222222e-06, + "loss": 0.4168, + "step": 24260 + }, + { + "epoch": 0.5392374688944188, + "grad_norm": 2.453125, + "learning_rate": 9.213777777777779e-06, + "loss": 0.3985, + "step": 24270 + }, + { + "epoch": 0.5394596516174902, + "grad_norm": 2.515625, + "learning_rate": 9.209333333333335e-06, + "loss": 0.3861, + "step": 24280 + }, + { + "epoch": 0.5396818343405617, + "grad_norm": 2.1875, + "learning_rate": 9.20488888888889e-06, + "loss": 0.3952, + "step": 24290 + }, + { + "epoch": 0.5399040170636331, + "grad_norm": 2.46875, + "learning_rate": 9.200444444444445e-06, + "loss": 0.3793, + "step": 24300 + }, + { + "epoch": 0.5401261997867046, + "grad_norm": 2.546875, + "learning_rate": 9.196e-06, + "loss": 0.3831, + "step": 24310 + }, + { + "epoch": 0.540348382509776, + "grad_norm": 2.9375, + "learning_rate": 9.191555555555556e-06, + "loss": 0.3802, + "step": 24320 + }, + { + "epoch": 0.5405705652328475, + "grad_norm": 2.5, + "learning_rate": 9.187111111111113e-06, + "loss": 0.4176, + "step": 24330 + }, + { + "epoch": 0.5407927479559189, + "grad_norm": 2.59375, + "learning_rate": 9.182666666666668e-06, + "loss": 0.342, + "step": 24340 + }, + { + "epoch": 0.5410149306789904, + "grad_norm": 2.359375, + "learning_rate": 9.178222222222222e-06, + "loss": 0.3505, + "step": 24350 + }, + { + "epoch": 0.5412371134020618, + "grad_norm": 2.328125, + "learning_rate": 9.173777777777777e-06, + "loss": 0.4308, + "step": 24360 + }, + { + "epoch": 0.5414592961251333, + "grad_norm": 2.375, + "learning_rate": 9.169333333333334e-06, + "loss": 0.3593, + "step": 24370 + }, + { + "epoch": 0.5416814788482047, + "grad_norm": 2.546875, + "learning_rate": 9.16488888888889e-06, + "loss": 0.3972, + "step": 24380 + }, + { + "epoch": 0.5419036615712762, + "grad_norm": 2.234375, + "learning_rate": 9.160444444444445e-06, + "loss": 0.3626, + "step": 24390 + }, + { + "epoch": 0.5421258442943476, + "grad_norm": 2.78125, + "learning_rate": 9.156e-06, + "loss": 0.3998, + "step": 24400 + }, + { + "epoch": 0.5423480270174191, + "grad_norm": 2.8125, + "learning_rate": 9.151555555555557e-06, + "loss": 0.4031, + "step": 24410 + }, + { + "epoch": 0.5425702097404905, + "grad_norm": 2.53125, + "learning_rate": 9.147111111111111e-06, + "loss": 0.3568, + "step": 24420 + }, + { + "epoch": 0.542792392463562, + "grad_norm": 2.578125, + "learning_rate": 9.142666666666668e-06, + "loss": 0.3548, + "step": 24430 + }, + { + "epoch": 0.5430145751866335, + "grad_norm": 2.484375, + "learning_rate": 9.138222222222223e-06, + "loss": 0.3947, + "step": 24440 + }, + { + "epoch": 0.5432367579097049, + "grad_norm": 2.4375, + "learning_rate": 9.133777777777778e-06, + "loss": 0.4046, + "step": 24450 + }, + { + "epoch": 0.5434589406327764, + "grad_norm": 2.65625, + "learning_rate": 9.129333333333334e-06, + "loss": 0.4305, + "step": 24460 + }, + { + "epoch": 0.5436811233558478, + "grad_norm": 2.515625, + "learning_rate": 9.124888888888889e-06, + "loss": 0.413, + "step": 24470 + }, + { + "epoch": 0.5439033060789193, + "grad_norm": 2.078125, + "learning_rate": 9.120444444444446e-06, + "loss": 0.3882, + "step": 24480 + }, + { + "epoch": 0.5441254888019907, + "grad_norm": 2.390625, + "learning_rate": 9.116e-06, + "loss": 0.4162, + "step": 24490 + }, + { + "epoch": 0.5443476715250622, + "grad_norm": 2.296875, + "learning_rate": 9.111555555555557e-06, + "loss": 0.402, + "step": 24500 + }, + { + "epoch": 0.5445698542481336, + "grad_norm": 2.34375, + "learning_rate": 9.107111111111112e-06, + "loss": 0.4079, + "step": 24510 + }, + { + "epoch": 0.5447920369712052, + "grad_norm": 2.640625, + "learning_rate": 9.102666666666667e-06, + "loss": 0.3852, + "step": 24520 + }, + { + "epoch": 0.5450142196942765, + "grad_norm": 2.84375, + "learning_rate": 9.098222222222223e-06, + "loss": 0.3792, + "step": 24530 + }, + { + "epoch": 0.545236402417348, + "grad_norm": 2.609375, + "learning_rate": 9.093777777777778e-06, + "loss": 0.3756, + "step": 24540 + }, + { + "epoch": 0.5454585851404194, + "grad_norm": 2.5625, + "learning_rate": 9.089333333333335e-06, + "loss": 0.379, + "step": 24550 + }, + { + "epoch": 0.545680767863491, + "grad_norm": 2.25, + "learning_rate": 9.08488888888889e-06, + "loss": 0.3756, + "step": 24560 + }, + { + "epoch": 0.5459029505865624, + "grad_norm": 2.703125, + "learning_rate": 9.080444444444444e-06, + "loss": 0.3867, + "step": 24570 + }, + { + "epoch": 0.5461251333096339, + "grad_norm": 2.1875, + "learning_rate": 9.076000000000001e-06, + "loss": 0.3852, + "step": 24580 + }, + { + "epoch": 0.5463473160327053, + "grad_norm": 2.03125, + "learning_rate": 9.071555555555557e-06, + "loss": 0.3832, + "step": 24590 + }, + { + "epoch": 0.5465694987557768, + "grad_norm": 1.9296875, + "learning_rate": 9.067111111111112e-06, + "loss": 0.363, + "step": 24600 + }, + { + "epoch": 0.5467916814788482, + "grad_norm": 2.609375, + "learning_rate": 9.062666666666667e-06, + "loss": 0.4217, + "step": 24610 + }, + { + "epoch": 0.5470138642019197, + "grad_norm": 2.21875, + "learning_rate": 9.058222222222222e-06, + "loss": 0.3823, + "step": 24620 + }, + { + "epoch": 0.5472360469249911, + "grad_norm": 2.59375, + "learning_rate": 9.053777777777778e-06, + "loss": 0.3898, + "step": 24630 + }, + { + "epoch": 0.5474582296480626, + "grad_norm": 3.078125, + "learning_rate": 9.049333333333335e-06, + "loss": 0.3935, + "step": 24640 + }, + { + "epoch": 0.5476804123711341, + "grad_norm": 2.453125, + "learning_rate": 9.04488888888889e-06, + "loss": 0.3809, + "step": 24650 + }, + { + "epoch": 0.5479025950942055, + "grad_norm": 2.390625, + "learning_rate": 9.040444444444445e-06, + "loss": 0.3726, + "step": 24660 + }, + { + "epoch": 0.548124777817277, + "grad_norm": 2.453125, + "learning_rate": 9.036e-06, + "loss": 0.3664, + "step": 24670 + }, + { + "epoch": 0.5483469605403484, + "grad_norm": 2.203125, + "learning_rate": 9.031555555555556e-06, + "loss": 0.3716, + "step": 24680 + }, + { + "epoch": 0.5485691432634199, + "grad_norm": 2.09375, + "learning_rate": 9.027111111111113e-06, + "loss": 0.4025, + "step": 24690 + }, + { + "epoch": 0.5487913259864913, + "grad_norm": 2.21875, + "learning_rate": 9.022666666666667e-06, + "loss": 0.3298, + "step": 24700 + }, + { + "epoch": 0.5490135087095628, + "grad_norm": 2.078125, + "learning_rate": 9.018222222222222e-06, + "loss": 0.3857, + "step": 24710 + }, + { + "epoch": 0.5492356914326342, + "grad_norm": 2.609375, + "learning_rate": 9.013777777777779e-06, + "loss": 0.3528, + "step": 24720 + }, + { + "epoch": 0.5494578741557057, + "grad_norm": 2.21875, + "learning_rate": 9.009333333333334e-06, + "loss": 0.3563, + "step": 24730 + }, + { + "epoch": 0.5496800568787771, + "grad_norm": 2.4375, + "learning_rate": 9.00488888888889e-06, + "loss": 0.3696, + "step": 24740 + }, + { + "epoch": 0.5499022396018486, + "grad_norm": 2.328125, + "learning_rate": 9.000444444444445e-06, + "loss": 0.3847, + "step": 24750 + }, + { + "epoch": 0.55012442232492, + "grad_norm": 2.359375, + "learning_rate": 8.996e-06, + "loss": 0.3865, + "step": 24760 + }, + { + "epoch": 0.5503466050479915, + "grad_norm": 2.890625, + "learning_rate": 8.991555555555556e-06, + "loss": 0.3664, + "step": 24770 + }, + { + "epoch": 0.5505687877710629, + "grad_norm": 2.625, + "learning_rate": 8.987111111111111e-06, + "loss": 0.4078, + "step": 24780 + }, + { + "epoch": 0.5507909704941344, + "grad_norm": 2.453125, + "learning_rate": 8.982666666666668e-06, + "loss": 0.3581, + "step": 24790 + }, + { + "epoch": 0.5510131532172058, + "grad_norm": 2.25, + "learning_rate": 8.978222222222223e-06, + "loss": 0.3675, + "step": 24800 + }, + { + "epoch": 0.5512353359402773, + "grad_norm": 2.65625, + "learning_rate": 8.97377777777778e-06, + "loss": 0.4063, + "step": 24810 + }, + { + "epoch": 0.5514575186633487, + "grad_norm": 2.453125, + "learning_rate": 8.969333333333334e-06, + "loss": 0.385, + "step": 24820 + }, + { + "epoch": 0.5516797013864202, + "grad_norm": 2.625, + "learning_rate": 8.964888888888889e-06, + "loss": 0.4183, + "step": 24830 + }, + { + "epoch": 0.5519018841094916, + "grad_norm": 3.484375, + "learning_rate": 8.960444444444445e-06, + "loss": 0.4121, + "step": 24840 + }, + { + "epoch": 0.5521240668325631, + "grad_norm": 2.0625, + "learning_rate": 8.956e-06, + "loss": 0.3987, + "step": 24850 + }, + { + "epoch": 0.5523462495556346, + "grad_norm": 2.484375, + "learning_rate": 8.951555555555557e-06, + "loss": 0.3913, + "step": 24860 + }, + { + "epoch": 0.552568432278706, + "grad_norm": 2.15625, + "learning_rate": 8.947111111111112e-06, + "loss": 0.4085, + "step": 24870 + }, + { + "epoch": 0.5527906150017775, + "grad_norm": 2.296875, + "learning_rate": 8.942666666666667e-06, + "loss": 0.3699, + "step": 24880 + }, + { + "epoch": 0.5530127977248489, + "grad_norm": 2.578125, + "learning_rate": 8.938222222222223e-06, + "loss": 0.3996, + "step": 24890 + }, + { + "epoch": 0.5532349804479204, + "grad_norm": 2.1875, + "learning_rate": 8.93377777777778e-06, + "loss": 0.378, + "step": 24900 + }, + { + "epoch": 0.5534571631709918, + "grad_norm": 2.75, + "learning_rate": 8.929333333333334e-06, + "loss": 0.4091, + "step": 24910 + }, + { + "epoch": 0.5536793458940633, + "grad_norm": 2.765625, + "learning_rate": 8.92488888888889e-06, + "loss": 0.3781, + "step": 24920 + }, + { + "epoch": 0.5539015286171347, + "grad_norm": 2.5, + "learning_rate": 8.920444444444444e-06, + "loss": 0.3566, + "step": 24930 + }, + { + "epoch": 0.5541237113402062, + "grad_norm": 2.265625, + "learning_rate": 8.916e-06, + "loss": 0.3532, + "step": 24940 + }, + { + "epoch": 0.5543458940632776, + "grad_norm": 2.4375, + "learning_rate": 8.911555555555557e-06, + "loss": 0.3517, + "step": 24950 + }, + { + "epoch": 0.5545680767863491, + "grad_norm": 2.625, + "learning_rate": 8.907111111111112e-06, + "loss": 0.3661, + "step": 24960 + }, + { + "epoch": 0.5547902595094205, + "grad_norm": 2.203125, + "learning_rate": 8.902666666666667e-06, + "loss": 0.3667, + "step": 24970 + }, + { + "epoch": 0.555012442232492, + "grad_norm": 2.9375, + "learning_rate": 8.898222222222222e-06, + "loss": 0.384, + "step": 24980 + }, + { + "epoch": 0.5552346249555634, + "grad_norm": 2.46875, + "learning_rate": 8.893777777777778e-06, + "loss": 0.3959, + "step": 24990 + }, + { + "epoch": 0.5554568076786349, + "grad_norm": 2.21875, + "learning_rate": 8.889333333333335e-06, + "loss": 0.3585, + "step": 25000 + }, + { + "epoch": 0.5556789904017063, + "grad_norm": 2.546875, + "learning_rate": 8.88488888888889e-06, + "loss": 0.3841, + "step": 25010 + }, + { + "epoch": 0.5559011731247778, + "grad_norm": 2.78125, + "learning_rate": 8.880444444444445e-06, + "loss": 0.3772, + "step": 25020 + }, + { + "epoch": 0.5561233558478492, + "grad_norm": 2.4375, + "learning_rate": 8.876e-06, + "loss": 0.3723, + "step": 25030 + }, + { + "epoch": 0.5563455385709207, + "grad_norm": 2.46875, + "learning_rate": 8.871555555555556e-06, + "loss": 0.4183, + "step": 25040 + }, + { + "epoch": 0.5565677212939921, + "grad_norm": 2.765625, + "learning_rate": 8.867111111111112e-06, + "loss": 0.3735, + "step": 25050 + }, + { + "epoch": 0.5567899040170636, + "grad_norm": 2.328125, + "learning_rate": 8.862666666666667e-06, + "loss": 0.3475, + "step": 25060 + }, + { + "epoch": 0.5570120867401351, + "grad_norm": 2.3125, + "learning_rate": 8.858222222222222e-06, + "loss": 0.3926, + "step": 25070 + }, + { + "epoch": 0.5572342694632065, + "grad_norm": 2.78125, + "learning_rate": 8.853777777777779e-06, + "loss": 0.4119, + "step": 25080 + }, + { + "epoch": 0.557456452186278, + "grad_norm": 2.84375, + "learning_rate": 8.849333333333334e-06, + "loss": 0.3947, + "step": 25090 + }, + { + "epoch": 0.5576786349093494, + "grad_norm": 2.5625, + "learning_rate": 8.84488888888889e-06, + "loss": 0.4107, + "step": 25100 + }, + { + "epoch": 0.5579008176324209, + "grad_norm": 2.609375, + "learning_rate": 8.840444444444445e-06, + "loss": 0.4038, + "step": 25110 + }, + { + "epoch": 0.5581230003554923, + "grad_norm": 2.421875, + "learning_rate": 8.836000000000001e-06, + "loss": 0.3674, + "step": 25120 + }, + { + "epoch": 0.5583451830785638, + "grad_norm": 2.765625, + "learning_rate": 8.831555555555556e-06, + "loss": 0.428, + "step": 25130 + }, + { + "epoch": 0.5585673658016352, + "grad_norm": 2.3125, + "learning_rate": 8.827111111111111e-06, + "loss": 0.3403, + "step": 25140 + }, + { + "epoch": 0.5587895485247067, + "grad_norm": 2.328125, + "learning_rate": 8.822666666666668e-06, + "loss": 0.3861, + "step": 25150 + }, + { + "epoch": 0.5590117312477781, + "grad_norm": 2.625, + "learning_rate": 8.818222222222223e-06, + "loss": 0.3654, + "step": 25160 + }, + { + "epoch": 0.5592339139708496, + "grad_norm": 2.5, + "learning_rate": 8.813777777777779e-06, + "loss": 0.3821, + "step": 25170 + }, + { + "epoch": 0.559456096693921, + "grad_norm": 2.578125, + "learning_rate": 8.809333333333334e-06, + "loss": 0.3817, + "step": 25180 + }, + { + "epoch": 0.5596782794169926, + "grad_norm": 2.515625, + "learning_rate": 8.804888888888889e-06, + "loss": 0.3875, + "step": 25190 + }, + { + "epoch": 0.559900462140064, + "grad_norm": 3.578125, + "learning_rate": 8.800444444444445e-06, + "loss": 0.4084, + "step": 25200 + }, + { + "epoch": 0.5601226448631355, + "grad_norm": 2.5, + "learning_rate": 8.796000000000002e-06, + "loss": 0.4038, + "step": 25210 + }, + { + "epoch": 0.5603448275862069, + "grad_norm": 1.984375, + "learning_rate": 8.791555555555557e-06, + "loss": 0.3568, + "step": 25220 + }, + { + "epoch": 0.5605670103092784, + "grad_norm": 2.078125, + "learning_rate": 8.787111111111112e-06, + "loss": 0.3638, + "step": 25230 + }, + { + "epoch": 0.5607891930323498, + "grad_norm": 2.515625, + "learning_rate": 8.782666666666666e-06, + "loss": 0.4023, + "step": 25240 + }, + { + "epoch": 0.5610113757554213, + "grad_norm": 2.3125, + "learning_rate": 8.778222222222223e-06, + "loss": 0.3715, + "step": 25250 + }, + { + "epoch": 0.5612335584784928, + "grad_norm": 2.65625, + "learning_rate": 8.77377777777778e-06, + "loss": 0.3963, + "step": 25260 + }, + { + "epoch": 0.5614557412015642, + "grad_norm": 2.90625, + "learning_rate": 8.769333333333334e-06, + "loss": 0.3837, + "step": 25270 + }, + { + "epoch": 0.5616779239246357, + "grad_norm": 2.296875, + "learning_rate": 8.764888888888889e-06, + "loss": 0.4131, + "step": 25280 + }, + { + "epoch": 0.5619001066477071, + "grad_norm": 2.046875, + "learning_rate": 8.760444444444444e-06, + "loss": 0.3561, + "step": 25290 + }, + { + "epoch": 0.5621222893707786, + "grad_norm": 2.109375, + "learning_rate": 8.756e-06, + "loss": 0.3487, + "step": 25300 + }, + { + "epoch": 0.56234447209385, + "grad_norm": 2.421875, + "learning_rate": 8.751555555555557e-06, + "loss": 0.388, + "step": 25310 + }, + { + "epoch": 0.5625666548169215, + "grad_norm": 2.296875, + "learning_rate": 8.747111111111112e-06, + "loss": 0.3671, + "step": 25320 + }, + { + "epoch": 0.5627888375399929, + "grad_norm": 2.53125, + "learning_rate": 8.742666666666667e-06, + "loss": 0.4174, + "step": 25330 + }, + { + "epoch": 0.5630110202630644, + "grad_norm": 2.546875, + "learning_rate": 8.738222222222222e-06, + "loss": 0.405, + "step": 25340 + }, + { + "epoch": 0.5632332029861358, + "grad_norm": 2.40625, + "learning_rate": 8.733777777777778e-06, + "loss": 0.3833, + "step": 25350 + }, + { + "epoch": 0.5634553857092073, + "grad_norm": 2.328125, + "learning_rate": 8.729333333333335e-06, + "loss": 0.4073, + "step": 25360 + }, + { + "epoch": 0.5636775684322787, + "grad_norm": 2.78125, + "learning_rate": 8.72488888888889e-06, + "loss": 0.3715, + "step": 25370 + }, + { + "epoch": 0.5638997511553502, + "grad_norm": 2.28125, + "learning_rate": 8.720444444444444e-06, + "loss": 0.3855, + "step": 25380 + }, + { + "epoch": 0.5641219338784216, + "grad_norm": 2.265625, + "learning_rate": 8.716000000000001e-06, + "loss": 0.3614, + "step": 25390 + }, + { + "epoch": 0.5643441166014931, + "grad_norm": 2.125, + "learning_rate": 8.711555555555556e-06, + "loss": 0.3656, + "step": 25400 + }, + { + "epoch": 0.5645662993245645, + "grad_norm": 2.1875, + "learning_rate": 8.707111111111112e-06, + "loss": 0.3981, + "step": 25410 + }, + { + "epoch": 0.564788482047636, + "grad_norm": 2.3125, + "learning_rate": 8.702666666666667e-06, + "loss": 0.4118, + "step": 25420 + }, + { + "epoch": 0.5650106647707074, + "grad_norm": 2.703125, + "learning_rate": 8.698222222222224e-06, + "loss": 0.3816, + "step": 25430 + }, + { + "epoch": 0.5652328474937789, + "grad_norm": 2.484375, + "learning_rate": 8.693777777777779e-06, + "loss": 0.3911, + "step": 25440 + }, + { + "epoch": 0.5654550302168503, + "grad_norm": 1.984375, + "learning_rate": 8.689333333333333e-06, + "loss": 0.3567, + "step": 25450 + }, + { + "epoch": 0.5656772129399218, + "grad_norm": 2.546875, + "learning_rate": 8.68488888888889e-06, + "loss": 0.3928, + "step": 25460 + }, + { + "epoch": 0.5658993956629933, + "grad_norm": 1.671875, + "learning_rate": 8.680444444444445e-06, + "loss": 0.3439, + "step": 25470 + }, + { + "epoch": 0.5661215783860647, + "grad_norm": 2.65625, + "learning_rate": 8.676000000000001e-06, + "loss": 0.3818, + "step": 25480 + }, + { + "epoch": 0.5663437611091362, + "grad_norm": 3.171875, + "learning_rate": 8.671555555555556e-06, + "loss": 0.3726, + "step": 25490 + }, + { + "epoch": 0.5665659438322076, + "grad_norm": 2.8125, + "learning_rate": 8.667111111111111e-06, + "loss": 0.3898, + "step": 25500 + }, + { + "epoch": 0.5667881265552791, + "grad_norm": 2.515625, + "learning_rate": 8.662666666666668e-06, + "loss": 0.4124, + "step": 25510 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 2.234375, + "learning_rate": 8.658222222222224e-06, + "loss": 0.3822, + "step": 25520 + }, + { + "epoch": 0.567232492001422, + "grad_norm": 2.1875, + "learning_rate": 8.653777777777779e-06, + "loss": 0.3788, + "step": 25530 + }, + { + "epoch": 0.5674546747244934, + "grad_norm": 2.125, + "learning_rate": 8.649333333333334e-06, + "loss": 0.3916, + "step": 25540 + }, + { + "epoch": 0.5676768574475649, + "grad_norm": 2.40625, + "learning_rate": 8.644888888888889e-06, + "loss": 0.3715, + "step": 25550 + }, + { + "epoch": 0.5678990401706363, + "grad_norm": 2.65625, + "learning_rate": 8.640444444444445e-06, + "loss": 0.3836, + "step": 25560 + }, + { + "epoch": 0.5681212228937078, + "grad_norm": 2.203125, + "learning_rate": 8.636000000000002e-06, + "loss": 0.4102, + "step": 25570 + }, + { + "epoch": 0.5683434056167792, + "grad_norm": 2.109375, + "learning_rate": 8.631555555555557e-06, + "loss": 0.3541, + "step": 25580 + }, + { + "epoch": 0.5685655883398507, + "grad_norm": 2.4375, + "learning_rate": 8.627111111111111e-06, + "loss": 0.3905, + "step": 25590 + }, + { + "epoch": 0.5687877710629221, + "grad_norm": 2.359375, + "learning_rate": 8.622666666666666e-06, + "loss": 0.3719, + "step": 25600 + }, + { + "epoch": 0.5690099537859936, + "grad_norm": 2.5, + "learning_rate": 8.618222222222223e-06, + "loss": 0.3946, + "step": 25610 + }, + { + "epoch": 0.569232136509065, + "grad_norm": 2.578125, + "learning_rate": 8.61377777777778e-06, + "loss": 0.4057, + "step": 25620 + }, + { + "epoch": 0.5694543192321365, + "grad_norm": 2.265625, + "learning_rate": 8.609333333333334e-06, + "loss": 0.4055, + "step": 25630 + }, + { + "epoch": 0.5696765019552079, + "grad_norm": 2.25, + "learning_rate": 8.604888888888889e-06, + "loss": 0.4136, + "step": 25640 + }, + { + "epoch": 0.5698986846782794, + "grad_norm": 2.40625, + "learning_rate": 8.600444444444444e-06, + "loss": 0.396, + "step": 25650 + }, + { + "epoch": 0.5701208674013508, + "grad_norm": 2.90625, + "learning_rate": 8.596e-06, + "loss": 0.3647, + "step": 25660 + }, + { + "epoch": 0.5703430501244223, + "grad_norm": 1.96875, + "learning_rate": 8.591555555555557e-06, + "loss": 0.3717, + "step": 25670 + }, + { + "epoch": 0.5705652328474938, + "grad_norm": 2.265625, + "learning_rate": 8.587111111111112e-06, + "loss": 0.3827, + "step": 25680 + }, + { + "epoch": 0.5707874155705652, + "grad_norm": 2.515625, + "learning_rate": 8.582666666666667e-06, + "loss": 0.4217, + "step": 25690 + }, + { + "epoch": 0.5710095982936367, + "grad_norm": 2.640625, + "learning_rate": 8.578222222222223e-06, + "loss": 0.4349, + "step": 25700 + }, + { + "epoch": 0.5712317810167081, + "grad_norm": 2.203125, + "learning_rate": 8.573777777777778e-06, + "loss": 0.3873, + "step": 25710 + }, + { + "epoch": 0.5714539637397796, + "grad_norm": 2.015625, + "learning_rate": 8.569333333333335e-06, + "loss": 0.4171, + "step": 25720 + }, + { + "epoch": 0.571676146462851, + "grad_norm": 2.53125, + "learning_rate": 8.56488888888889e-06, + "loss": 0.4147, + "step": 25730 + }, + { + "epoch": 0.5718983291859225, + "grad_norm": 2.296875, + "learning_rate": 8.560444444444446e-06, + "loss": 0.4042, + "step": 25740 + }, + { + "epoch": 0.5721205119089939, + "grad_norm": 2.703125, + "learning_rate": 8.556e-06, + "loss": 0.4124, + "step": 25750 + }, + { + "epoch": 0.5723426946320654, + "grad_norm": 2.078125, + "learning_rate": 8.551555555555556e-06, + "loss": 0.3899, + "step": 25760 + }, + { + "epoch": 0.5725648773551368, + "grad_norm": 2.15625, + "learning_rate": 8.547111111111112e-06, + "loss": 0.397, + "step": 25770 + }, + { + "epoch": 0.5727870600782083, + "grad_norm": 2.453125, + "learning_rate": 8.542666666666667e-06, + "loss": 0.3999, + "step": 25780 + }, + { + "epoch": 0.5730092428012797, + "grad_norm": 2.53125, + "learning_rate": 8.538222222222224e-06, + "loss": 0.3891, + "step": 25790 + }, + { + "epoch": 0.5732314255243512, + "grad_norm": 2.78125, + "learning_rate": 8.533777777777778e-06, + "loss": 0.4335, + "step": 25800 + }, + { + "epoch": 0.5734536082474226, + "grad_norm": 2.203125, + "learning_rate": 8.529333333333333e-06, + "loss": 0.4054, + "step": 25810 + }, + { + "epoch": 0.5736757909704941, + "grad_norm": 2.171875, + "learning_rate": 8.52488888888889e-06, + "loss": 0.3576, + "step": 25820 + }, + { + "epoch": 0.5738979736935655, + "grad_norm": 2.640625, + "learning_rate": 8.520444444444446e-06, + "loss": 0.428, + "step": 25830 + }, + { + "epoch": 0.574120156416637, + "grad_norm": 2.078125, + "learning_rate": 8.516000000000001e-06, + "loss": 0.362, + "step": 25840 + }, + { + "epoch": 0.5743423391397084, + "grad_norm": 2.0625, + "learning_rate": 8.511555555555556e-06, + "loss": 0.3951, + "step": 25850 + }, + { + "epoch": 0.57456452186278, + "grad_norm": 2.375, + "learning_rate": 8.50711111111111e-06, + "loss": 0.3886, + "step": 25860 + }, + { + "epoch": 0.5747867045858513, + "grad_norm": 2.8125, + "learning_rate": 8.502666666666667e-06, + "loss": 0.4179, + "step": 25870 + }, + { + "epoch": 0.5750088873089229, + "grad_norm": 1.9921875, + "learning_rate": 8.498222222222224e-06, + "loss": 0.3508, + "step": 25880 + }, + { + "epoch": 0.5752310700319944, + "grad_norm": 2.71875, + "learning_rate": 8.493777777777779e-06, + "loss": 0.3949, + "step": 25890 + }, + { + "epoch": 0.5754532527550658, + "grad_norm": 2.53125, + "learning_rate": 8.489333333333334e-06, + "loss": 0.4156, + "step": 25900 + }, + { + "epoch": 0.5756754354781373, + "grad_norm": 2.15625, + "learning_rate": 8.484888888888888e-06, + "loss": 0.3551, + "step": 25910 + }, + { + "epoch": 0.5758976182012087, + "grad_norm": 2.15625, + "learning_rate": 8.480444444444445e-06, + "loss": 0.3424, + "step": 25920 + }, + { + "epoch": 0.5761198009242802, + "grad_norm": 2.578125, + "learning_rate": 8.476000000000002e-06, + "loss": 0.4259, + "step": 25930 + }, + { + "epoch": 0.5763419836473516, + "grad_norm": 2.4375, + "learning_rate": 8.471555555555556e-06, + "loss": 0.3698, + "step": 25940 + }, + { + "epoch": 0.5765641663704231, + "grad_norm": 2.78125, + "learning_rate": 8.467111111111111e-06, + "loss": 0.3744, + "step": 25950 + }, + { + "epoch": 0.5767863490934945, + "grad_norm": 2.015625, + "learning_rate": 8.462666666666666e-06, + "loss": 0.3816, + "step": 25960 + }, + { + "epoch": 0.577008531816566, + "grad_norm": 2.515625, + "learning_rate": 8.458222222222223e-06, + "loss": 0.3592, + "step": 25970 + }, + { + "epoch": 0.5772307145396374, + "grad_norm": 2.0625, + "learning_rate": 8.453777777777779e-06, + "loss": 0.361, + "step": 25980 + }, + { + "epoch": 0.5774528972627089, + "grad_norm": 2.5625, + "learning_rate": 8.449333333333334e-06, + "loss": 0.3623, + "step": 25990 + }, + { + "epoch": 0.5776750799857803, + "grad_norm": 2.28125, + "learning_rate": 8.444888888888889e-06, + "loss": 0.4267, + "step": 26000 + }, + { + "epoch": 0.5778972627088518, + "grad_norm": 2.4375, + "learning_rate": 8.440444444444445e-06, + "loss": 0.4347, + "step": 26010 + }, + { + "epoch": 0.5781194454319232, + "grad_norm": 2.453125, + "learning_rate": 8.436e-06, + "loss": 0.3921, + "step": 26020 + }, + { + "epoch": 0.5783416281549947, + "grad_norm": 2.484375, + "learning_rate": 8.431555555555557e-06, + "loss": 0.3784, + "step": 26030 + }, + { + "epoch": 0.5785638108780661, + "grad_norm": 2.25, + "learning_rate": 8.427111111111112e-06, + "loss": 0.3643, + "step": 26040 + }, + { + "epoch": 0.5787859936011376, + "grad_norm": 2.640625, + "learning_rate": 8.422666666666668e-06, + "loss": 0.3954, + "step": 26050 + }, + { + "epoch": 0.579008176324209, + "grad_norm": 2.109375, + "learning_rate": 8.418222222222223e-06, + "loss": 0.3729, + "step": 26060 + }, + { + "epoch": 0.5792303590472805, + "grad_norm": 2.421875, + "learning_rate": 8.413777777777778e-06, + "loss": 0.4098, + "step": 26070 + }, + { + "epoch": 0.579452541770352, + "grad_norm": 2.5, + "learning_rate": 8.409333333333334e-06, + "loss": 0.3831, + "step": 26080 + }, + { + "epoch": 0.5796747244934234, + "grad_norm": 2.40625, + "learning_rate": 8.40488888888889e-06, + "loss": 0.3327, + "step": 26090 + }, + { + "epoch": 0.5798969072164949, + "grad_norm": 2.296875, + "learning_rate": 8.400444444444446e-06, + "loss": 0.389, + "step": 26100 + }, + { + "epoch": 0.5801190899395663, + "grad_norm": 2.453125, + "learning_rate": 8.396e-06, + "loss": 0.4266, + "step": 26110 + }, + { + "epoch": 0.5803412726626378, + "grad_norm": 2.40625, + "learning_rate": 8.391555555555555e-06, + "loss": 0.3712, + "step": 26120 + }, + { + "epoch": 0.5805634553857092, + "grad_norm": 2.59375, + "learning_rate": 8.387111111111112e-06, + "loss": 0.3867, + "step": 26130 + }, + { + "epoch": 0.5807856381087807, + "grad_norm": 2.359375, + "learning_rate": 8.382666666666669e-06, + "loss": 0.4288, + "step": 26140 + }, + { + "epoch": 0.5810078208318521, + "grad_norm": 2.46875, + "learning_rate": 8.378222222222223e-06, + "loss": 0.3929, + "step": 26150 + }, + { + "epoch": 0.5812300035549236, + "grad_norm": 2.375, + "learning_rate": 8.373777777777778e-06, + "loss": 0.3626, + "step": 26160 + }, + { + "epoch": 0.581452186277995, + "grad_norm": 2.71875, + "learning_rate": 8.369333333333333e-06, + "loss": 0.3701, + "step": 26170 + }, + { + "epoch": 0.5816743690010665, + "grad_norm": 2.25, + "learning_rate": 8.36488888888889e-06, + "loss": 0.3522, + "step": 26180 + }, + { + "epoch": 0.5818965517241379, + "grad_norm": 2.390625, + "learning_rate": 8.360444444444446e-06, + "loss": 0.3935, + "step": 26190 + }, + { + "epoch": 0.5821187344472094, + "grad_norm": 2.0625, + "learning_rate": 8.356000000000001e-06, + "loss": 0.3453, + "step": 26200 + }, + { + "epoch": 0.5823409171702808, + "grad_norm": 2.1875, + "learning_rate": 8.351555555555556e-06, + "loss": 0.3886, + "step": 26210 + }, + { + "epoch": 0.5825630998933523, + "grad_norm": 2.8125, + "learning_rate": 8.34711111111111e-06, + "loss": 0.4037, + "step": 26220 + }, + { + "epoch": 0.5827852826164237, + "grad_norm": 2.421875, + "learning_rate": 8.342666666666667e-06, + "loss": 0.3791, + "step": 26230 + }, + { + "epoch": 0.5830074653394952, + "grad_norm": 2.015625, + "learning_rate": 8.338222222222224e-06, + "loss": 0.3771, + "step": 26240 + }, + { + "epoch": 0.5832296480625666, + "grad_norm": 2.53125, + "learning_rate": 8.333777777777779e-06, + "loss": 0.3691, + "step": 26250 + }, + { + "epoch": 0.5834518307856381, + "grad_norm": 2.53125, + "learning_rate": 8.329333333333333e-06, + "loss": 0.3587, + "step": 26260 + }, + { + "epoch": 0.5836740135087095, + "grad_norm": 2.53125, + "learning_rate": 8.324888888888888e-06, + "loss": 0.3607, + "step": 26270 + }, + { + "epoch": 0.583896196231781, + "grad_norm": 2.671875, + "learning_rate": 8.320444444444445e-06, + "loss": 0.3732, + "step": 26280 + }, + { + "epoch": 0.5841183789548525, + "grad_norm": 2.578125, + "learning_rate": 8.316000000000001e-06, + "loss": 0.3848, + "step": 26290 + }, + { + "epoch": 0.5843405616779239, + "grad_norm": 2.515625, + "learning_rate": 8.311555555555556e-06, + "loss": 0.3833, + "step": 26300 + }, + { + "epoch": 0.5845627444009954, + "grad_norm": 2.390625, + "learning_rate": 8.307111111111111e-06, + "loss": 0.351, + "step": 26310 + }, + { + "epoch": 0.5847849271240668, + "grad_norm": 2.46875, + "learning_rate": 8.302666666666668e-06, + "loss": 0.3716, + "step": 26320 + }, + { + "epoch": 0.5850071098471383, + "grad_norm": 2.3125, + "learning_rate": 8.298222222222222e-06, + "loss": 0.3664, + "step": 26330 + }, + { + "epoch": 0.5852292925702097, + "grad_norm": 3.125, + "learning_rate": 8.293777777777779e-06, + "loss": 0.4228, + "step": 26340 + }, + { + "epoch": 0.5854514752932812, + "grad_norm": 3.15625, + "learning_rate": 8.289333333333334e-06, + "loss": 0.3897, + "step": 26350 + }, + { + "epoch": 0.5856736580163526, + "grad_norm": 2.890625, + "learning_rate": 8.28488888888889e-06, + "loss": 0.3806, + "step": 26360 + }, + { + "epoch": 0.5858958407394241, + "grad_norm": 2.28125, + "learning_rate": 8.280444444444445e-06, + "loss": 0.378, + "step": 26370 + }, + { + "epoch": 0.5861180234624955, + "grad_norm": 2.65625, + "learning_rate": 8.276e-06, + "loss": 0.3796, + "step": 26380 + }, + { + "epoch": 0.586340206185567, + "grad_norm": 2.125, + "learning_rate": 8.271555555555557e-06, + "loss": 0.3603, + "step": 26390 + }, + { + "epoch": 0.5865623889086384, + "grad_norm": 2.171875, + "learning_rate": 8.267111111111111e-06, + "loss": 0.3759, + "step": 26400 + }, + { + "epoch": 0.5867845716317099, + "grad_norm": 2.53125, + "learning_rate": 8.262666666666668e-06, + "loss": 0.3788, + "step": 26410 + }, + { + "epoch": 0.5870067543547813, + "grad_norm": 3.0625, + "learning_rate": 8.258222222222223e-06, + "loss": 0.3692, + "step": 26420 + }, + { + "epoch": 0.5872289370778528, + "grad_norm": 2.53125, + "learning_rate": 8.253777777777778e-06, + "loss": 0.4502, + "step": 26430 + }, + { + "epoch": 0.5874511198009242, + "grad_norm": 2.609375, + "learning_rate": 8.249333333333334e-06, + "loss": 0.3761, + "step": 26440 + }, + { + "epoch": 0.5876733025239957, + "grad_norm": 2.546875, + "learning_rate": 8.24488888888889e-06, + "loss": 0.3842, + "step": 26450 + }, + { + "epoch": 0.5878954852470671, + "grad_norm": 2.078125, + "learning_rate": 8.240444444444446e-06, + "loss": 0.3799, + "step": 26460 + }, + { + "epoch": 0.5881176679701386, + "grad_norm": 2.4375, + "learning_rate": 8.236e-06, + "loss": 0.3935, + "step": 26470 + }, + { + "epoch": 0.58833985069321, + "grad_norm": 3.0625, + "learning_rate": 8.231555555555555e-06, + "loss": 0.4229, + "step": 26480 + }, + { + "epoch": 0.5885620334162815, + "grad_norm": 2.15625, + "learning_rate": 8.227111111111112e-06, + "loss": 0.39, + "step": 26490 + }, + { + "epoch": 0.5887842161393531, + "grad_norm": 2.359375, + "learning_rate": 8.222666666666668e-06, + "loss": 0.386, + "step": 26500 + }, + { + "epoch": 0.5890063988624245, + "grad_norm": 2.515625, + "learning_rate": 8.218222222222223e-06, + "loss": 0.3853, + "step": 26510 + }, + { + "epoch": 0.589228581585496, + "grad_norm": 2.3125, + "learning_rate": 8.213777777777778e-06, + "loss": 0.4081, + "step": 26520 + }, + { + "epoch": 0.5894507643085674, + "grad_norm": 2.484375, + "learning_rate": 8.209333333333333e-06, + "loss": 0.3724, + "step": 26530 + }, + { + "epoch": 0.5896729470316389, + "grad_norm": 2.140625, + "learning_rate": 8.20488888888889e-06, + "loss": 0.3663, + "step": 26540 + }, + { + "epoch": 0.5898951297547103, + "grad_norm": 2.25, + "learning_rate": 8.200444444444446e-06, + "loss": 0.402, + "step": 26550 + }, + { + "epoch": 0.5901173124777818, + "grad_norm": 2.578125, + "learning_rate": 8.196e-06, + "loss": 0.3919, + "step": 26560 + }, + { + "epoch": 0.5903394952008532, + "grad_norm": 2.390625, + "learning_rate": 8.191555555555556e-06, + "loss": 0.3896, + "step": 26570 + }, + { + "epoch": 0.5905616779239247, + "grad_norm": 2.421875, + "learning_rate": 8.18711111111111e-06, + "loss": 0.4163, + "step": 26580 + }, + { + "epoch": 0.5907838606469961, + "grad_norm": 2.0625, + "learning_rate": 8.182666666666667e-06, + "loss": 0.3443, + "step": 26590 + }, + { + "epoch": 0.5910060433700676, + "grad_norm": 2.328125, + "learning_rate": 8.178222222222224e-06, + "loss": 0.3769, + "step": 26600 + }, + { + "epoch": 0.591228226093139, + "grad_norm": 2.6875, + "learning_rate": 8.173777777777778e-06, + "loss": 0.3943, + "step": 26610 + }, + { + "epoch": 0.5914504088162105, + "grad_norm": 2.4375, + "learning_rate": 8.169333333333333e-06, + "loss": 0.406, + "step": 26620 + }, + { + "epoch": 0.5916725915392819, + "grad_norm": 2.234375, + "learning_rate": 8.16488888888889e-06, + "loss": 0.3781, + "step": 26630 + }, + { + "epoch": 0.5918947742623534, + "grad_norm": 2.421875, + "learning_rate": 8.160444444444445e-06, + "loss": 0.3836, + "step": 26640 + }, + { + "epoch": 0.5921169569854248, + "grad_norm": 2.25, + "learning_rate": 8.156000000000001e-06, + "loss": 0.4014, + "step": 26650 + }, + { + "epoch": 0.5923391397084963, + "grad_norm": 2.375, + "learning_rate": 8.151555555555556e-06, + "loss": 0.3852, + "step": 26660 + }, + { + "epoch": 0.5925613224315677, + "grad_norm": 2.40625, + "learning_rate": 8.147111111111113e-06, + "loss": 0.3652, + "step": 26670 + }, + { + "epoch": 0.5927835051546392, + "grad_norm": 2.234375, + "learning_rate": 8.142666666666667e-06, + "loss": 0.3818, + "step": 26680 + }, + { + "epoch": 0.5930056878777106, + "grad_norm": 2.78125, + "learning_rate": 8.138222222222222e-06, + "loss": 0.3772, + "step": 26690 + }, + { + "epoch": 0.5932278706007821, + "grad_norm": 3.140625, + "learning_rate": 8.133777777777779e-06, + "loss": 0.4421, + "step": 26700 + }, + { + "epoch": 0.5934500533238536, + "grad_norm": 2.28125, + "learning_rate": 8.129333333333334e-06, + "loss": 0.3549, + "step": 26710 + }, + { + "epoch": 0.593672236046925, + "grad_norm": 2.625, + "learning_rate": 8.12488888888889e-06, + "loss": 0.4144, + "step": 26720 + }, + { + "epoch": 0.5938944187699965, + "grad_norm": 2.265625, + "learning_rate": 8.120444444444445e-06, + "loss": 0.3973, + "step": 26730 + }, + { + "epoch": 0.5941166014930679, + "grad_norm": 2.3125, + "learning_rate": 8.116e-06, + "loss": 0.4206, + "step": 26740 + }, + { + "epoch": 0.5943387842161394, + "grad_norm": 2.359375, + "learning_rate": 8.111555555555556e-06, + "loss": 0.3941, + "step": 26750 + }, + { + "epoch": 0.5945609669392108, + "grad_norm": 2.1875, + "learning_rate": 8.107111111111113e-06, + "loss": 0.3912, + "step": 26760 + }, + { + "epoch": 0.5947831496622823, + "grad_norm": 2.234375, + "learning_rate": 8.102666666666668e-06, + "loss": 0.4154, + "step": 26770 + }, + { + "epoch": 0.5950053323853537, + "grad_norm": 2.421875, + "learning_rate": 8.098222222222223e-06, + "loss": 0.3884, + "step": 26780 + }, + { + "epoch": 0.5952275151084252, + "grad_norm": 2.484375, + "learning_rate": 8.093777777777777e-06, + "loss": 0.3626, + "step": 26790 + }, + { + "epoch": 0.5954496978314966, + "grad_norm": 2.015625, + "learning_rate": 8.089333333333334e-06, + "loss": 0.4196, + "step": 26800 + }, + { + "epoch": 0.5956718805545681, + "grad_norm": 2.3125, + "learning_rate": 8.08488888888889e-06, + "loss": 0.365, + "step": 26810 + }, + { + "epoch": 0.5958940632776395, + "grad_norm": 2.484375, + "learning_rate": 8.080444444444445e-06, + "loss": 0.36, + "step": 26820 + }, + { + "epoch": 0.596116246000711, + "grad_norm": 2.265625, + "learning_rate": 8.076e-06, + "loss": 0.362, + "step": 26830 + }, + { + "epoch": 0.5963384287237824, + "grad_norm": 2.375, + "learning_rate": 8.071555555555555e-06, + "loss": 0.3744, + "step": 26840 + }, + { + "epoch": 0.5965606114468539, + "grad_norm": 1.9765625, + "learning_rate": 8.067111111111112e-06, + "loss": 0.3593, + "step": 26850 + }, + { + "epoch": 0.5967827941699253, + "grad_norm": 2.375, + "learning_rate": 8.062666666666668e-06, + "loss": 0.3708, + "step": 26860 + }, + { + "epoch": 0.5970049768929968, + "grad_norm": 2.390625, + "learning_rate": 8.058222222222223e-06, + "loss": 0.372, + "step": 26870 + }, + { + "epoch": 0.5972271596160682, + "grad_norm": 2.53125, + "learning_rate": 8.053777777777778e-06, + "loss": 0.3874, + "step": 26880 + }, + { + "epoch": 0.5974493423391397, + "grad_norm": 2.53125, + "learning_rate": 8.049333333333333e-06, + "loss": 0.3587, + "step": 26890 + }, + { + "epoch": 0.5976715250622112, + "grad_norm": 2.25, + "learning_rate": 8.04488888888889e-06, + "loss": 0.3558, + "step": 26900 + }, + { + "epoch": 0.5978937077852826, + "grad_norm": 2.484375, + "learning_rate": 8.040444444444446e-06, + "loss": 0.3847, + "step": 26910 + }, + { + "epoch": 0.5981158905083541, + "grad_norm": 2.3125, + "learning_rate": 8.036e-06, + "loss": 0.4165, + "step": 26920 + }, + { + "epoch": 0.5983380732314255, + "grad_norm": 2.546875, + "learning_rate": 8.031555555555555e-06, + "loss": 0.3804, + "step": 26930 + }, + { + "epoch": 0.598560255954497, + "grad_norm": 2.703125, + "learning_rate": 8.027111111111112e-06, + "loss": 0.3987, + "step": 26940 + }, + { + "epoch": 0.5987824386775684, + "grad_norm": 3.0, + "learning_rate": 8.022666666666667e-06, + "loss": 0.4194, + "step": 26950 + }, + { + "epoch": 0.5990046214006399, + "grad_norm": 2.234375, + "learning_rate": 8.018222222222223e-06, + "loss": 0.3575, + "step": 26960 + }, + { + "epoch": 0.5992268041237113, + "grad_norm": 2.0625, + "learning_rate": 8.013777777777778e-06, + "loss": 0.3862, + "step": 26970 + }, + { + "epoch": 0.5994489868467828, + "grad_norm": 2.640625, + "learning_rate": 8.009333333333335e-06, + "loss": 0.3968, + "step": 26980 + }, + { + "epoch": 0.5996711695698542, + "grad_norm": 2.859375, + "learning_rate": 8.00488888888889e-06, + "loss": 0.3703, + "step": 26990 + }, + { + "epoch": 0.5998933522929257, + "grad_norm": 1.921875, + "learning_rate": 8.000444444444444e-06, + "loss": 0.388, + "step": 27000 + }, + { + "epoch": 0.6001155350159971, + "grad_norm": 2.671875, + "learning_rate": 7.996000000000001e-06, + "loss": 0.3826, + "step": 27010 + }, + { + "epoch": 0.6003377177390686, + "grad_norm": 2.140625, + "learning_rate": 7.991555555555556e-06, + "loss": 0.382, + "step": 27020 + }, + { + "epoch": 0.60055990046214, + "grad_norm": 2.671875, + "learning_rate": 7.987111111111112e-06, + "loss": 0.3621, + "step": 27030 + }, + { + "epoch": 0.6007820831852115, + "grad_norm": 2.5625, + "learning_rate": 7.982666666666667e-06, + "loss": 0.4083, + "step": 27040 + }, + { + "epoch": 0.6010042659082829, + "grad_norm": 2.40625, + "learning_rate": 7.978222222222222e-06, + "loss": 0.3844, + "step": 27050 + }, + { + "epoch": 0.6012264486313544, + "grad_norm": 2.53125, + "learning_rate": 7.973777777777779e-06, + "loss": 0.3781, + "step": 27060 + }, + { + "epoch": 0.6014486313544258, + "grad_norm": 2.265625, + "learning_rate": 7.969333333333335e-06, + "loss": 0.3586, + "step": 27070 + }, + { + "epoch": 0.6016708140774973, + "grad_norm": 2.46875, + "learning_rate": 7.96488888888889e-06, + "loss": 0.3765, + "step": 27080 + }, + { + "epoch": 0.6018929968005687, + "grad_norm": 2.859375, + "learning_rate": 7.960444444444445e-06, + "loss": 0.3895, + "step": 27090 + }, + { + "epoch": 0.6021151795236402, + "grad_norm": 2.296875, + "learning_rate": 7.956e-06, + "loss": 0.3906, + "step": 27100 + }, + { + "epoch": 0.6023373622467117, + "grad_norm": 2.21875, + "learning_rate": 7.951555555555556e-06, + "loss": 0.3882, + "step": 27110 + }, + { + "epoch": 0.6025595449697831, + "grad_norm": 2.078125, + "learning_rate": 7.947111111111113e-06, + "loss": 0.3914, + "step": 27120 + }, + { + "epoch": 0.6027817276928547, + "grad_norm": 2.46875, + "learning_rate": 7.942666666666668e-06, + "loss": 0.3816, + "step": 27130 + }, + { + "epoch": 0.603003910415926, + "grad_norm": 2.140625, + "learning_rate": 7.938222222222222e-06, + "loss": 0.3587, + "step": 27140 + }, + { + "epoch": 0.6032260931389976, + "grad_norm": 2.453125, + "learning_rate": 7.933777777777777e-06, + "loss": 0.4219, + "step": 27150 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 2.53125, + "learning_rate": 7.929333333333334e-06, + "loss": 0.3842, + "step": 27160 + }, + { + "epoch": 0.6036704585851405, + "grad_norm": 2.171875, + "learning_rate": 7.92488888888889e-06, + "loss": 0.3762, + "step": 27170 + }, + { + "epoch": 0.6038926413082119, + "grad_norm": 2.265625, + "learning_rate": 7.920444444444445e-06, + "loss": 0.3777, + "step": 27180 + }, + { + "epoch": 0.6041148240312834, + "grad_norm": 2.96875, + "learning_rate": 7.916e-06, + "loss": 0.389, + "step": 27190 + }, + { + "epoch": 0.6043370067543548, + "grad_norm": 2.46875, + "learning_rate": 7.911555555555555e-06, + "loss": 0.3915, + "step": 27200 + }, + { + "epoch": 0.6045591894774263, + "grad_norm": 2.265625, + "learning_rate": 7.907111111111111e-06, + "loss": 0.3741, + "step": 27210 + }, + { + "epoch": 0.6047813722004977, + "grad_norm": 2.421875, + "learning_rate": 7.902666666666668e-06, + "loss": 0.3503, + "step": 27220 + }, + { + "epoch": 0.6050035549235692, + "grad_norm": 2.71875, + "learning_rate": 7.898222222222223e-06, + "loss": 0.3603, + "step": 27230 + }, + { + "epoch": 0.6052257376466406, + "grad_norm": 2.640625, + "learning_rate": 7.893777777777778e-06, + "loss": 0.3596, + "step": 27240 + }, + { + "epoch": 0.6054479203697121, + "grad_norm": 2.65625, + "learning_rate": 7.889333333333334e-06, + "loss": 0.4017, + "step": 27250 + }, + { + "epoch": 0.6056701030927835, + "grad_norm": 2.21875, + "learning_rate": 7.884888888888889e-06, + "loss": 0.375, + "step": 27260 + }, + { + "epoch": 0.605892285815855, + "grad_norm": 2.671875, + "learning_rate": 7.880444444444446e-06, + "loss": 0.3834, + "step": 27270 + }, + { + "epoch": 0.6061144685389264, + "grad_norm": 2.21875, + "learning_rate": 7.876e-06, + "loss": 0.3794, + "step": 27280 + }, + { + "epoch": 0.6063366512619979, + "grad_norm": 2.359375, + "learning_rate": 7.871555555555557e-06, + "loss": 0.3893, + "step": 27290 + }, + { + "epoch": 0.6065588339850693, + "grad_norm": 2.0625, + "learning_rate": 7.867111111111112e-06, + "loss": 0.3511, + "step": 27300 + }, + { + "epoch": 0.6067810167081408, + "grad_norm": 2.5, + "learning_rate": 7.862666666666667e-06, + "loss": 0.3741, + "step": 27310 + }, + { + "epoch": 0.6070031994312123, + "grad_norm": 2.921875, + "learning_rate": 7.858222222222223e-06, + "loss": 0.4052, + "step": 27320 + }, + { + "epoch": 0.6072253821542837, + "grad_norm": 2.609375, + "learning_rate": 7.853777777777778e-06, + "loss": 0.4087, + "step": 27330 + }, + { + "epoch": 0.6074475648773552, + "grad_norm": 2.28125, + "learning_rate": 7.849333333333335e-06, + "loss": 0.3986, + "step": 27340 + }, + { + "epoch": 0.6076697476004266, + "grad_norm": 2.71875, + "learning_rate": 7.84488888888889e-06, + "loss": 0.3892, + "step": 27350 + }, + { + "epoch": 0.6078919303234981, + "grad_norm": 2.4375, + "learning_rate": 7.840444444444444e-06, + "loss": 0.3862, + "step": 27360 + }, + { + "epoch": 0.6081141130465695, + "grad_norm": 2.8125, + "learning_rate": 7.836000000000001e-06, + "loss": 0.3889, + "step": 27370 + }, + { + "epoch": 0.608336295769641, + "grad_norm": 2.640625, + "learning_rate": 7.831555555555557e-06, + "loss": 0.3702, + "step": 27380 + }, + { + "epoch": 0.6085584784927124, + "grad_norm": 2.625, + "learning_rate": 7.827111111111112e-06, + "loss": 0.3859, + "step": 27390 + }, + { + "epoch": 0.6087806612157839, + "grad_norm": 2.78125, + "learning_rate": 7.822666666666667e-06, + "loss": 0.4044, + "step": 27400 + }, + { + "epoch": 0.6090028439388553, + "grad_norm": 2.28125, + "learning_rate": 7.818222222222222e-06, + "loss": 0.3996, + "step": 27410 + }, + { + "epoch": 0.6092250266619268, + "grad_norm": 2.078125, + "learning_rate": 7.813777777777778e-06, + "loss": 0.3879, + "step": 27420 + }, + { + "epoch": 0.6094472093849982, + "grad_norm": 3.0, + "learning_rate": 7.809333333333335e-06, + "loss": 0.3552, + "step": 27430 + }, + { + "epoch": 0.6096693921080697, + "grad_norm": 2.328125, + "learning_rate": 7.80488888888889e-06, + "loss": 0.3508, + "step": 27440 + }, + { + "epoch": 0.6098915748311411, + "grad_norm": 1.9453125, + "learning_rate": 7.800444444444445e-06, + "loss": 0.3573, + "step": 27450 + }, + { + "epoch": 0.6101137575542126, + "grad_norm": 2.4375, + "learning_rate": 7.796e-06, + "loss": 0.385, + "step": 27460 + }, + { + "epoch": 0.610335940277284, + "grad_norm": 2.296875, + "learning_rate": 7.791555555555556e-06, + "loss": 0.3949, + "step": 27470 + }, + { + "epoch": 0.6105581230003555, + "grad_norm": 2.78125, + "learning_rate": 7.787111111111113e-06, + "loss": 0.393, + "step": 27480 + }, + { + "epoch": 0.6107803057234269, + "grad_norm": 2.4375, + "learning_rate": 7.782666666666667e-06, + "loss": 0.3738, + "step": 27490 + }, + { + "epoch": 0.6110024884464984, + "grad_norm": 2.859375, + "learning_rate": 7.778222222222222e-06, + "loss": 0.4097, + "step": 27500 + }, + { + "epoch": 0.6112246711695698, + "grad_norm": 2.109375, + "learning_rate": 7.773777777777777e-06, + "loss": 0.3911, + "step": 27510 + }, + { + "epoch": 0.6114468538926413, + "grad_norm": 2.5, + "learning_rate": 7.769333333333334e-06, + "loss": 0.4197, + "step": 27520 + }, + { + "epoch": 0.6116690366157128, + "grad_norm": 2.375, + "learning_rate": 7.76488888888889e-06, + "loss": 0.3551, + "step": 27530 + }, + { + "epoch": 0.6118912193387842, + "grad_norm": 2.8125, + "learning_rate": 7.760444444444445e-06, + "loss": 0.3914, + "step": 27540 + }, + { + "epoch": 0.6121134020618557, + "grad_norm": 3.125, + "learning_rate": 7.756e-06, + "loss": 0.371, + "step": 27550 + }, + { + "epoch": 0.6123355847849271, + "grad_norm": 2.328125, + "learning_rate": 7.751555555555556e-06, + "loss": 0.3976, + "step": 27560 + }, + { + "epoch": 0.6125577675079986, + "grad_norm": 2.734375, + "learning_rate": 7.747111111111111e-06, + "loss": 0.3953, + "step": 27570 + }, + { + "epoch": 0.61277995023107, + "grad_norm": 2.765625, + "learning_rate": 7.742666666666668e-06, + "loss": 0.3821, + "step": 27580 + }, + { + "epoch": 0.6130021329541415, + "grad_norm": 2.296875, + "learning_rate": 7.738222222222223e-06, + "loss": 0.3977, + "step": 27590 + }, + { + "epoch": 0.6132243156772129, + "grad_norm": 2.390625, + "learning_rate": 7.73377777777778e-06, + "loss": 0.392, + "step": 27600 + }, + { + "epoch": 0.6134464984002844, + "grad_norm": 2.0625, + "learning_rate": 7.729333333333334e-06, + "loss": 0.3432, + "step": 27610 + }, + { + "epoch": 0.6136686811233558, + "grad_norm": 2.46875, + "learning_rate": 7.724888888888889e-06, + "loss": 0.381, + "step": 27620 + }, + { + "epoch": 0.6138908638464273, + "grad_norm": 2.703125, + "learning_rate": 7.720444444444445e-06, + "loss": 0.3911, + "step": 27630 + }, + { + "epoch": 0.6141130465694987, + "grad_norm": 2.34375, + "learning_rate": 7.716e-06, + "loss": 0.3796, + "step": 27640 + }, + { + "epoch": 0.6143352292925702, + "grad_norm": 2.578125, + "learning_rate": 7.711555555555557e-06, + "loss": 0.3854, + "step": 27650 + }, + { + "epoch": 0.6145574120156416, + "grad_norm": 2.578125, + "learning_rate": 7.707111111111112e-06, + "loss": 0.4328, + "step": 27660 + }, + { + "epoch": 0.6147795947387131, + "grad_norm": 2.46875, + "learning_rate": 7.702666666666667e-06, + "loss": 0.3718, + "step": 27670 + }, + { + "epoch": 0.6150017774617845, + "grad_norm": 2.359375, + "learning_rate": 7.698222222222223e-06, + "loss": 0.3647, + "step": 27680 + }, + { + "epoch": 0.615223960184856, + "grad_norm": 2.3125, + "learning_rate": 7.69377777777778e-06, + "loss": 0.4028, + "step": 27690 + }, + { + "epoch": 0.6154461429079274, + "grad_norm": 2.453125, + "learning_rate": 7.689333333333334e-06, + "loss": 0.399, + "step": 27700 + }, + { + "epoch": 0.6156683256309989, + "grad_norm": 2.15625, + "learning_rate": 7.68488888888889e-06, + "loss": 0.3847, + "step": 27710 + }, + { + "epoch": 0.6158905083540704, + "grad_norm": 2.578125, + "learning_rate": 7.680444444444444e-06, + "loss": 0.3966, + "step": 27720 + }, + { + "epoch": 0.6161126910771418, + "grad_norm": 2.59375, + "learning_rate": 7.676e-06, + "loss": 0.4005, + "step": 27730 + }, + { + "epoch": 0.6163348738002133, + "grad_norm": 2.25, + "learning_rate": 7.671555555555557e-06, + "loss": 0.3959, + "step": 27740 + }, + { + "epoch": 0.6165570565232847, + "grad_norm": 2.328125, + "learning_rate": 7.667111111111112e-06, + "loss": 0.3794, + "step": 27750 + }, + { + "epoch": 0.6167792392463562, + "grad_norm": 2.578125, + "learning_rate": 7.662666666666667e-06, + "loss": 0.376, + "step": 27760 + }, + { + "epoch": 0.6170014219694276, + "grad_norm": 2.21875, + "learning_rate": 7.658222222222222e-06, + "loss": 0.388, + "step": 27770 + }, + { + "epoch": 0.6172236046924992, + "grad_norm": 1.828125, + "learning_rate": 7.653777777777778e-06, + "loss": 0.398, + "step": 27780 + }, + { + "epoch": 0.6174457874155705, + "grad_norm": 2.5625, + "learning_rate": 7.649333333333335e-06, + "loss": 0.3529, + "step": 27790 + }, + { + "epoch": 0.617667970138642, + "grad_norm": 3.046875, + "learning_rate": 7.64488888888889e-06, + "loss": 0.3881, + "step": 27800 + }, + { + "epoch": 0.6178901528617134, + "grad_norm": 2.140625, + "learning_rate": 7.640444444444445e-06, + "loss": 0.357, + "step": 27810 + }, + { + "epoch": 0.618112335584785, + "grad_norm": 2.703125, + "learning_rate": 7.636e-06, + "loss": 0.3694, + "step": 27820 + }, + { + "epoch": 0.6183345183078564, + "grad_norm": 2.25, + "learning_rate": 7.631555555555556e-06, + "loss": 0.396, + "step": 27830 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 3.09375, + "learning_rate": 7.627111111111112e-06, + "loss": 0.4501, + "step": 27840 + }, + { + "epoch": 0.6187788837539993, + "grad_norm": 2.421875, + "learning_rate": 7.622666666666667e-06, + "loss": 0.3959, + "step": 27850 + }, + { + "epoch": 0.6190010664770708, + "grad_norm": 3.0625, + "learning_rate": 7.618222222222222e-06, + "loss": 0.3726, + "step": 27860 + }, + { + "epoch": 0.6192232492001422, + "grad_norm": 2.453125, + "learning_rate": 7.613777777777779e-06, + "loss": 0.3592, + "step": 27870 + }, + { + "epoch": 0.6194454319232137, + "grad_norm": 2.171875, + "learning_rate": 7.609333333333334e-06, + "loss": 0.3812, + "step": 27880 + }, + { + "epoch": 0.6196676146462851, + "grad_norm": 2.71875, + "learning_rate": 7.604888888888889e-06, + "loss": 0.3947, + "step": 27890 + }, + { + "epoch": 0.6198897973693566, + "grad_norm": 2.3125, + "learning_rate": 7.600444444444445e-06, + "loss": 0.3816, + "step": 27900 + }, + { + "epoch": 0.620111980092428, + "grad_norm": 2.359375, + "learning_rate": 7.5960000000000015e-06, + "loss": 0.3579, + "step": 27910 + }, + { + "epoch": 0.6203341628154995, + "grad_norm": 2.5, + "learning_rate": 7.591555555555556e-06, + "loss": 0.3766, + "step": 27920 + }, + { + "epoch": 0.620556345538571, + "grad_norm": 2.21875, + "learning_rate": 7.587111111111112e-06, + "loss": 0.4472, + "step": 27930 + }, + { + "epoch": 0.6207785282616424, + "grad_norm": 2.140625, + "learning_rate": 7.582666666666667e-06, + "loss": 0.3622, + "step": 27940 + }, + { + "epoch": 0.6210007109847139, + "grad_norm": 2.609375, + "learning_rate": 7.5782222222222225e-06, + "loss": 0.3767, + "step": 27950 + }, + { + "epoch": 0.6212228937077853, + "grad_norm": 2.34375, + "learning_rate": 7.573777777777779e-06, + "loss": 0.3808, + "step": 27960 + }, + { + "epoch": 0.6214450764308568, + "grad_norm": 2.359375, + "learning_rate": 7.569333333333334e-06, + "loss": 0.3676, + "step": 27970 + }, + { + "epoch": 0.6216672591539282, + "grad_norm": 2.734375, + "learning_rate": 7.56488888888889e-06, + "loss": 0.3983, + "step": 27980 + }, + { + "epoch": 0.6218894418769997, + "grad_norm": 2.421875, + "learning_rate": 7.5604444444444445e-06, + "loss": 0.4203, + "step": 27990 + }, + { + "epoch": 0.6221116246000711, + "grad_norm": 2.203125, + "learning_rate": 7.556000000000001e-06, + "loss": 0.4059, + "step": 28000 + }, + { + "epoch": 0.6223338073231426, + "grad_norm": 2.25, + "learning_rate": 7.551555555555557e-06, + "loss": 0.3762, + "step": 28010 + }, + { + "epoch": 0.622555990046214, + "grad_norm": 2.359375, + "learning_rate": 7.5471111111111115e-06, + "loss": 0.3727, + "step": 28020 + }, + { + "epoch": 0.6227781727692855, + "grad_norm": 2.5625, + "learning_rate": 7.542666666666667e-06, + "loss": 0.3787, + "step": 28030 + }, + { + "epoch": 0.6230003554923569, + "grad_norm": 2.515625, + "learning_rate": 7.538222222222222e-06, + "loss": 0.3643, + "step": 28040 + }, + { + "epoch": 0.6232225382154284, + "grad_norm": 2.453125, + "learning_rate": 7.533777777777779e-06, + "loss": 0.3842, + "step": 28050 + }, + { + "epoch": 0.6234447209384998, + "grad_norm": 2.59375, + "learning_rate": 7.529333333333334e-06, + "loss": 0.3752, + "step": 28060 + }, + { + "epoch": 0.6236669036615713, + "grad_norm": 2.484375, + "learning_rate": 7.524888888888889e-06, + "loss": 0.3572, + "step": 28070 + }, + { + "epoch": 0.6238890863846427, + "grad_norm": 2.28125, + "learning_rate": 7.520444444444445e-06, + "loss": 0.3834, + "step": 28080 + }, + { + "epoch": 0.6241112691077142, + "grad_norm": 2.40625, + "learning_rate": 7.516000000000001e-06, + "loss": 0.3519, + "step": 28090 + }, + { + "epoch": 0.6243334518307856, + "grad_norm": 2.265625, + "learning_rate": 7.511555555555556e-06, + "loss": 0.3824, + "step": 28100 + }, + { + "epoch": 0.6245556345538571, + "grad_norm": 2.59375, + "learning_rate": 7.507111111111112e-06, + "loss": 0.3752, + "step": 28110 + }, + { + "epoch": 0.6247778172769285, + "grad_norm": 2.25, + "learning_rate": 7.502666666666667e-06, + "loss": 0.3523, + "step": 28120 + }, + { + "epoch": 0.625, + "grad_norm": 2.40625, + "learning_rate": 7.4982222222222225e-06, + "loss": 0.3832, + "step": 28130 + }, + { + "epoch": 0.6252221827230715, + "grad_norm": 2.484375, + "learning_rate": 7.493777777777779e-06, + "loss": 0.4148, + "step": 28140 + }, + { + "epoch": 0.6254443654461429, + "grad_norm": 2.484375, + "learning_rate": 7.489333333333334e-06, + "loss": 0.366, + "step": 28150 + }, + { + "epoch": 0.6256665481692144, + "grad_norm": 2.6875, + "learning_rate": 7.4848888888888895e-06, + "loss": 0.3789, + "step": 28160 + }, + { + "epoch": 0.6258887308922858, + "grad_norm": 2.15625, + "learning_rate": 7.480444444444444e-06, + "loss": 0.3716, + "step": 28170 + }, + { + "epoch": 0.6261109136153573, + "grad_norm": 2.296875, + "learning_rate": 7.476000000000001e-06, + "loss": 0.3479, + "step": 28180 + }, + { + "epoch": 0.6263330963384287, + "grad_norm": 2.296875, + "learning_rate": 7.471555555555557e-06, + "loss": 0.3754, + "step": 28190 + }, + { + "epoch": 0.6265552790615002, + "grad_norm": 2.59375, + "learning_rate": 7.4671111111111115e-06, + "loss": 0.3692, + "step": 28200 + }, + { + "epoch": 0.6267774617845716, + "grad_norm": 2.390625, + "learning_rate": 7.462666666666667e-06, + "loss": 0.3499, + "step": 28210 + }, + { + "epoch": 0.6269996445076431, + "grad_norm": 2.53125, + "learning_rate": 7.458222222222224e-06, + "loss": 0.3593, + "step": 28220 + }, + { + "epoch": 0.6272218272307145, + "grad_norm": 2.515625, + "learning_rate": 7.4537777777777785e-06, + "loss": 0.3982, + "step": 28230 + }, + { + "epoch": 0.627444009953786, + "grad_norm": 2.21875, + "learning_rate": 7.449333333333334e-06, + "loss": 0.3793, + "step": 28240 + }, + { + "epoch": 0.6276661926768574, + "grad_norm": 2.609375, + "learning_rate": 7.444888888888889e-06, + "loss": 0.3641, + "step": 28250 + }, + { + "epoch": 0.6278883753999289, + "grad_norm": 2.15625, + "learning_rate": 7.440444444444445e-06, + "loss": 0.3766, + "step": 28260 + }, + { + "epoch": 0.6281105581230003, + "grad_norm": 2.5625, + "learning_rate": 7.436000000000001e-06, + "loss": 0.3702, + "step": 28270 + }, + { + "epoch": 0.6283327408460718, + "grad_norm": 2.703125, + "learning_rate": 7.431555555555556e-06, + "loss": 0.4161, + "step": 28280 + }, + { + "epoch": 0.6285549235691432, + "grad_norm": 2.796875, + "learning_rate": 7.427111111111112e-06, + "loss": 0.3946, + "step": 28290 + }, + { + "epoch": 0.6287771062922147, + "grad_norm": 2.375, + "learning_rate": 7.422666666666667e-06, + "loss": 0.4037, + "step": 28300 + }, + { + "epoch": 0.6289992890152861, + "grad_norm": 2.5625, + "learning_rate": 7.418222222222223e-06, + "loss": 0.3448, + "step": 28310 + }, + { + "epoch": 0.6292214717383576, + "grad_norm": 2.296875, + "learning_rate": 7.413777777777779e-06, + "loss": 0.3975, + "step": 28320 + }, + { + "epoch": 0.629443654461429, + "grad_norm": 2.734375, + "learning_rate": 7.409333333333334e-06, + "loss": 0.3917, + "step": 28330 + }, + { + "epoch": 0.6296658371845005, + "grad_norm": 2.40625, + "learning_rate": 7.4048888888888895e-06, + "loss": 0.3971, + "step": 28340 + }, + { + "epoch": 0.629888019907572, + "grad_norm": 2.40625, + "learning_rate": 7.400444444444444e-06, + "loss": 0.3488, + "step": 28350 + }, + { + "epoch": 0.6301102026306434, + "grad_norm": 2.9375, + "learning_rate": 7.396000000000001e-06, + "loss": 0.3703, + "step": 28360 + }, + { + "epoch": 0.6303323853537149, + "grad_norm": 2.546875, + "learning_rate": 7.3915555555555565e-06, + "loss": 0.3957, + "step": 28370 + }, + { + "epoch": 0.6305545680767863, + "grad_norm": 2.765625, + "learning_rate": 7.387111111111111e-06, + "loss": 0.3904, + "step": 28380 + }, + { + "epoch": 0.6307767507998578, + "grad_norm": 2.75, + "learning_rate": 7.382666666666667e-06, + "loss": 0.3894, + "step": 28390 + }, + { + "epoch": 0.6309989335229292, + "grad_norm": 2.25, + "learning_rate": 7.378222222222224e-06, + "loss": 0.3837, + "step": 28400 + }, + { + "epoch": 0.6312211162460007, + "grad_norm": 2.234375, + "learning_rate": 7.3737777777777785e-06, + "loss": 0.3542, + "step": 28410 + }, + { + "epoch": 0.6314432989690721, + "grad_norm": 2.265625, + "learning_rate": 7.369333333333334e-06, + "loss": 0.4, + "step": 28420 + }, + { + "epoch": 0.6316654816921436, + "grad_norm": 2.640625, + "learning_rate": 7.364888888888889e-06, + "loss": 0.3896, + "step": 28430 + }, + { + "epoch": 0.631887664415215, + "grad_norm": 2.40625, + "learning_rate": 7.360444444444445e-06, + "loss": 0.3741, + "step": 28440 + }, + { + "epoch": 0.6321098471382866, + "grad_norm": 2.96875, + "learning_rate": 7.356000000000001e-06, + "loss": 0.386, + "step": 28450 + }, + { + "epoch": 0.632332029861358, + "grad_norm": 2.71875, + "learning_rate": 7.351555555555556e-06, + "loss": 0.372, + "step": 28460 + }, + { + "epoch": 0.6325542125844295, + "grad_norm": 2.484375, + "learning_rate": 7.347111111111112e-06, + "loss": 0.3576, + "step": 28470 + }, + { + "epoch": 0.6327763953075008, + "grad_norm": 2.578125, + "learning_rate": 7.342666666666667e-06, + "loss": 0.3976, + "step": 28480 + }, + { + "epoch": 0.6329985780305724, + "grad_norm": 2.640625, + "learning_rate": 7.338222222222223e-06, + "loss": 0.3562, + "step": 28490 + }, + { + "epoch": 0.6332207607536438, + "grad_norm": 2.5, + "learning_rate": 7.333777777777779e-06, + "loss": 0.3926, + "step": 28500 + }, + { + "epoch": 0.6334429434767153, + "grad_norm": 3.015625, + "learning_rate": 7.329333333333334e-06, + "loss": 0.3995, + "step": 28510 + }, + { + "epoch": 0.6336651261997867, + "grad_norm": 2.140625, + "learning_rate": 7.324888888888889e-06, + "loss": 0.3772, + "step": 28520 + }, + { + "epoch": 0.6338873089228582, + "grad_norm": 2.546875, + "learning_rate": 7.320444444444446e-06, + "loss": 0.3619, + "step": 28530 + }, + { + "epoch": 0.6341094916459296, + "grad_norm": 2.703125, + "learning_rate": 7.316000000000001e-06, + "loss": 0.3671, + "step": 28540 + }, + { + "epoch": 0.6343316743690011, + "grad_norm": 2.578125, + "learning_rate": 7.3115555555555565e-06, + "loss": 0.3554, + "step": 28550 + }, + { + "epoch": 0.6345538570920726, + "grad_norm": 2.421875, + "learning_rate": 7.307111111111111e-06, + "loss": 0.3984, + "step": 28560 + }, + { + "epoch": 0.634776039815144, + "grad_norm": 2.359375, + "learning_rate": 7.302666666666667e-06, + "loss": 0.3698, + "step": 28570 + }, + { + "epoch": 0.6349982225382155, + "grad_norm": 2.515625, + "learning_rate": 7.2982222222222235e-06, + "loss": 0.358, + "step": 28580 + }, + { + "epoch": 0.6352204052612869, + "grad_norm": 2.03125, + "learning_rate": 7.293777777777778e-06, + "loss": 0.3577, + "step": 28590 + }, + { + "epoch": 0.6354425879843584, + "grad_norm": 2.59375, + "learning_rate": 7.289333333333334e-06, + "loss": 0.4029, + "step": 28600 + }, + { + "epoch": 0.6356647707074298, + "grad_norm": 2.0625, + "learning_rate": 7.284888888888889e-06, + "loss": 0.3996, + "step": 28610 + }, + { + "epoch": 0.6358869534305013, + "grad_norm": 2.015625, + "learning_rate": 7.2804444444444455e-06, + "loss": 0.3708, + "step": 28620 + }, + { + "epoch": 0.6361091361535727, + "grad_norm": 2.125, + "learning_rate": 7.276000000000001e-06, + "loss": 0.3876, + "step": 28630 + }, + { + "epoch": 0.6363313188766442, + "grad_norm": 2.390625, + "learning_rate": 7.271555555555556e-06, + "loss": 0.3658, + "step": 28640 + }, + { + "epoch": 0.6365535015997156, + "grad_norm": 2.4375, + "learning_rate": 7.267111111111112e-06, + "loss": 0.3854, + "step": 28650 + }, + { + "epoch": 0.6367756843227871, + "grad_norm": 2.328125, + "learning_rate": 7.2626666666666665e-06, + "loss": 0.3653, + "step": 28660 + }, + { + "epoch": 0.6369978670458585, + "grad_norm": 2.640625, + "learning_rate": 7.258222222222223e-06, + "loss": 0.3889, + "step": 28670 + }, + { + "epoch": 0.63722004976893, + "grad_norm": 2.546875, + "learning_rate": 7.253777777777779e-06, + "loss": 0.3712, + "step": 28680 + }, + { + "epoch": 0.6374422324920014, + "grad_norm": 2.203125, + "learning_rate": 7.249333333333334e-06, + "loss": 0.3837, + "step": 28690 + }, + { + "epoch": 0.6376644152150729, + "grad_norm": 2.375, + "learning_rate": 7.244888888888889e-06, + "loss": 0.3379, + "step": 28700 + }, + { + "epoch": 0.6378865979381443, + "grad_norm": 2.484375, + "learning_rate": 7.240444444444446e-06, + "loss": 0.3851, + "step": 28710 + }, + { + "epoch": 0.6381087806612158, + "grad_norm": 2.953125, + "learning_rate": 7.236000000000001e-06, + "loss": 0.3968, + "step": 28720 + }, + { + "epoch": 0.6383309633842872, + "grad_norm": 2.15625, + "learning_rate": 7.231555555555556e-06, + "loss": 0.3234, + "step": 28730 + }, + { + "epoch": 0.6385531461073587, + "grad_norm": 2.125, + "learning_rate": 7.227111111111111e-06, + "loss": 0.3903, + "step": 28740 + }, + { + "epoch": 0.6387753288304302, + "grad_norm": 2.109375, + "learning_rate": 7.222666666666667e-06, + "loss": 0.359, + "step": 28750 + }, + { + "epoch": 0.6389975115535016, + "grad_norm": 2.84375, + "learning_rate": 7.2182222222222235e-06, + "loss": 0.3954, + "step": 28760 + }, + { + "epoch": 0.6392196942765731, + "grad_norm": 2.265625, + "learning_rate": 7.213777777777778e-06, + "loss": 0.4062, + "step": 28770 + }, + { + "epoch": 0.6394418769996445, + "grad_norm": 2.28125, + "learning_rate": 7.209333333333334e-06, + "loss": 0.3806, + "step": 28780 + }, + { + "epoch": 0.639664059722716, + "grad_norm": 2.421875, + "learning_rate": 7.204888888888889e-06, + "loss": 0.3964, + "step": 28790 + }, + { + "epoch": 0.6398862424457874, + "grad_norm": 2.53125, + "learning_rate": 7.200444444444445e-06, + "loss": 0.3821, + "step": 28800 + }, + { + "epoch": 0.6401084251688589, + "grad_norm": 3.140625, + "learning_rate": 7.196000000000001e-06, + "loss": 0.3998, + "step": 28810 + }, + { + "epoch": 0.6403306078919303, + "grad_norm": 2.71875, + "learning_rate": 7.191555555555556e-06, + "loss": 0.3813, + "step": 28820 + }, + { + "epoch": 0.6405527906150018, + "grad_norm": 2.4375, + "learning_rate": 7.187111111111112e-06, + "loss": 0.437, + "step": 28830 + }, + { + "epoch": 0.6407749733380732, + "grad_norm": 3.140625, + "learning_rate": 7.182666666666668e-06, + "loss": 0.3915, + "step": 28840 + }, + { + "epoch": 0.6409971560611447, + "grad_norm": 2.609375, + "learning_rate": 7.178222222222223e-06, + "loss": 0.372, + "step": 28850 + }, + { + "epoch": 0.6412193387842161, + "grad_norm": 2.4375, + "learning_rate": 7.173777777777779e-06, + "loss": 0.3675, + "step": 28860 + }, + { + "epoch": 0.6414415215072876, + "grad_norm": 2.453125, + "learning_rate": 7.1693333333333335e-06, + "loss": 0.3891, + "step": 28870 + }, + { + "epoch": 0.641663704230359, + "grad_norm": 2.15625, + "learning_rate": 7.164888888888889e-06, + "loss": 0.3852, + "step": 28880 + }, + { + "epoch": 0.6418858869534305, + "grad_norm": 2.328125, + "learning_rate": 7.160444444444446e-06, + "loss": 0.3704, + "step": 28890 + }, + { + "epoch": 0.6421080696765019, + "grad_norm": 2.34375, + "learning_rate": 7.156000000000001e-06, + "loss": 0.4069, + "step": 28900 + }, + { + "epoch": 0.6423302523995734, + "grad_norm": 2.453125, + "learning_rate": 7.151555555555556e-06, + "loss": 0.3886, + "step": 28910 + }, + { + "epoch": 0.6425524351226448, + "grad_norm": 2.25, + "learning_rate": 7.147111111111111e-06, + "loss": 0.3955, + "step": 28920 + }, + { + "epoch": 0.6427746178457163, + "grad_norm": 2.34375, + "learning_rate": 7.142666666666668e-06, + "loss": 0.3679, + "step": 28930 + }, + { + "epoch": 0.6429968005687877, + "grad_norm": 2.3125, + "learning_rate": 7.138222222222223e-06, + "loss": 0.3426, + "step": 28940 + }, + { + "epoch": 0.6432189832918592, + "grad_norm": 2.0625, + "learning_rate": 7.133777777777778e-06, + "loss": 0.378, + "step": 28950 + }, + { + "epoch": 0.6434411660149307, + "grad_norm": 2.265625, + "learning_rate": 7.129333333333334e-06, + "loss": 0.4086, + "step": 28960 + }, + { + "epoch": 0.6436633487380021, + "grad_norm": 2.84375, + "learning_rate": 7.124888888888889e-06, + "loss": 0.3604, + "step": 28970 + }, + { + "epoch": 0.6438855314610736, + "grad_norm": 3.0, + "learning_rate": 7.120444444444445e-06, + "loss": 0.41, + "step": 28980 + }, + { + "epoch": 0.644107714184145, + "grad_norm": 2.234375, + "learning_rate": 7.116000000000001e-06, + "loss": 0.3723, + "step": 28990 + }, + { + "epoch": 0.6443298969072165, + "grad_norm": 2.546875, + "learning_rate": 7.111555555555556e-06, + "loss": 0.4027, + "step": 29000 + }, + { + "epoch": 0.6445520796302879, + "grad_norm": 2.484375, + "learning_rate": 7.1071111111111115e-06, + "loss": 0.3697, + "step": 29010 + }, + { + "epoch": 0.6447742623533594, + "grad_norm": 2.46875, + "learning_rate": 7.102666666666668e-06, + "loss": 0.3818, + "step": 29020 + }, + { + "epoch": 0.6449964450764308, + "grad_norm": 2.3125, + "learning_rate": 7.098222222222223e-06, + "loss": 0.3822, + "step": 29030 + }, + { + "epoch": 0.6452186277995023, + "grad_norm": 2.34375, + "learning_rate": 7.093777777777779e-06, + "loss": 0.358, + "step": 29040 + }, + { + "epoch": 0.6454408105225737, + "grad_norm": 2.109375, + "learning_rate": 7.0893333333333334e-06, + "loss": 0.4125, + "step": 29050 + }, + { + "epoch": 0.6456629932456452, + "grad_norm": 2.484375, + "learning_rate": 7.084888888888889e-06, + "loss": 0.3836, + "step": 29060 + }, + { + "epoch": 0.6458851759687166, + "grad_norm": 2.4375, + "learning_rate": 7.080444444444446e-06, + "loss": 0.3953, + "step": 29070 + }, + { + "epoch": 0.6461073586917881, + "grad_norm": 2.84375, + "learning_rate": 7.0760000000000005e-06, + "loss": 0.4048, + "step": 29080 + }, + { + "epoch": 0.6463295414148595, + "grad_norm": 2.046875, + "learning_rate": 7.071555555555556e-06, + "loss": 0.3697, + "step": 29090 + }, + { + "epoch": 0.646551724137931, + "grad_norm": 2.6875, + "learning_rate": 7.067111111111111e-06, + "loss": 0.3944, + "step": 29100 + }, + { + "epoch": 0.6467739068610024, + "grad_norm": 2.171875, + "learning_rate": 7.062666666666668e-06, + "loss": 0.3524, + "step": 29110 + }, + { + "epoch": 0.646996089584074, + "grad_norm": 2.46875, + "learning_rate": 7.058222222222223e-06, + "loss": 0.3895, + "step": 29120 + }, + { + "epoch": 0.6472182723071453, + "grad_norm": 2.265625, + "learning_rate": 7.053777777777778e-06, + "loss": 0.4205, + "step": 29130 + }, + { + "epoch": 0.6474404550302169, + "grad_norm": 2.53125, + "learning_rate": 7.049333333333334e-06, + "loss": 0.3825, + "step": 29140 + }, + { + "epoch": 0.6476626377532883, + "grad_norm": 2.890625, + "learning_rate": 7.04488888888889e-06, + "loss": 0.4281, + "step": 29150 + }, + { + "epoch": 0.6478848204763598, + "grad_norm": 2.34375, + "learning_rate": 7.040444444444445e-06, + "loss": 0.3488, + "step": 29160 + }, + { + "epoch": 0.6481070031994313, + "grad_norm": 2.078125, + "learning_rate": 7.036000000000001e-06, + "loss": 0.3986, + "step": 29170 + }, + { + "epoch": 0.6483291859225027, + "grad_norm": 3.15625, + "learning_rate": 7.031555555555556e-06, + "loss": 0.3739, + "step": 29180 + }, + { + "epoch": 0.6485513686455742, + "grad_norm": 2.171875, + "learning_rate": 7.0271111111111114e-06, + "loss": 0.3763, + "step": 29190 + }, + { + "epoch": 0.6487735513686456, + "grad_norm": 2.671875, + "learning_rate": 7.022666666666668e-06, + "loss": 0.3906, + "step": 29200 + }, + { + "epoch": 0.6489957340917171, + "grad_norm": 2.734375, + "learning_rate": 7.018222222222223e-06, + "loss": 0.3285, + "step": 29210 + }, + { + "epoch": 0.6492179168147885, + "grad_norm": 2.5, + "learning_rate": 7.0137777777777785e-06, + "loss": 0.4033, + "step": 29220 + }, + { + "epoch": 0.64944009953786, + "grad_norm": 2.65625, + "learning_rate": 7.009333333333333e-06, + "loss": 0.4007, + "step": 29230 + }, + { + "epoch": 0.6496622822609314, + "grad_norm": 2.28125, + "learning_rate": 7.00488888888889e-06, + "loss": 0.4126, + "step": 29240 + }, + { + "epoch": 0.6498844649840029, + "grad_norm": 2.5625, + "learning_rate": 7.000444444444446e-06, + "loss": 0.357, + "step": 29250 + }, + { + "epoch": 0.6501066477070743, + "grad_norm": 2.171875, + "learning_rate": 6.9960000000000004e-06, + "loss": 0.3915, + "step": 29260 + }, + { + "epoch": 0.6503288304301458, + "grad_norm": 2.71875, + "learning_rate": 6.991555555555556e-06, + "loss": 0.3613, + "step": 29270 + }, + { + "epoch": 0.6505510131532172, + "grad_norm": 2.4375, + "learning_rate": 6.987111111111111e-06, + "loss": 0.3869, + "step": 29280 + }, + { + "epoch": 0.6507731958762887, + "grad_norm": 3.578125, + "learning_rate": 6.9826666666666675e-06, + "loss": 0.3879, + "step": 29290 + }, + { + "epoch": 0.6509953785993601, + "grad_norm": 2.203125, + "learning_rate": 6.978222222222223e-06, + "loss": 0.3714, + "step": 29300 + }, + { + "epoch": 0.6512175613224316, + "grad_norm": 2.375, + "learning_rate": 6.973777777777778e-06, + "loss": 0.426, + "step": 29310 + }, + { + "epoch": 0.651439744045503, + "grad_norm": 2.765625, + "learning_rate": 6.969333333333334e-06, + "loss": 0.3995, + "step": 29320 + }, + { + "epoch": 0.6516619267685745, + "grad_norm": 2.46875, + "learning_rate": 6.96488888888889e-06, + "loss": 0.3908, + "step": 29330 + }, + { + "epoch": 0.6518841094916459, + "grad_norm": 2.453125, + "learning_rate": 6.960444444444445e-06, + "loss": 0.3951, + "step": 29340 + }, + { + "epoch": 0.6521062922147174, + "grad_norm": 2.171875, + "learning_rate": 6.956000000000001e-06, + "loss": 0.3983, + "step": 29350 + }, + { + "epoch": 0.6523284749377888, + "grad_norm": 2.625, + "learning_rate": 6.951555555555556e-06, + "loss": 0.3974, + "step": 29360 + }, + { + "epoch": 0.6525506576608603, + "grad_norm": 2.59375, + "learning_rate": 6.947111111111111e-06, + "loss": 0.3993, + "step": 29370 + }, + { + "epoch": 0.6527728403839318, + "grad_norm": 2.859375, + "learning_rate": 6.942666666666668e-06, + "loss": 0.3831, + "step": 29380 + }, + { + "epoch": 0.6529950231070032, + "grad_norm": 2.671875, + "learning_rate": 6.938222222222223e-06, + "loss": 0.412, + "step": 29390 + }, + { + "epoch": 0.6532172058300747, + "grad_norm": 2.734375, + "learning_rate": 6.9337777777777784e-06, + "loss": 0.3932, + "step": 29400 + }, + { + "epoch": 0.6534393885531461, + "grad_norm": 2.515625, + "learning_rate": 6.929333333333333e-06, + "loss": 0.3707, + "step": 29410 + }, + { + "epoch": 0.6536615712762176, + "grad_norm": 2.859375, + "learning_rate": 6.92488888888889e-06, + "loss": 0.3733, + "step": 29420 + }, + { + "epoch": 0.653883753999289, + "grad_norm": 2.46875, + "learning_rate": 6.9204444444444455e-06, + "loss": 0.3707, + "step": 29430 + }, + { + "epoch": 0.6541059367223605, + "grad_norm": 3.203125, + "learning_rate": 6.916e-06, + "loss": 0.3863, + "step": 29440 + }, + { + "epoch": 0.6543281194454319, + "grad_norm": 1.8046875, + "learning_rate": 6.911555555555556e-06, + "loss": 0.3652, + "step": 29450 + }, + { + "epoch": 0.6545503021685034, + "grad_norm": 2.671875, + "learning_rate": 6.907111111111113e-06, + "loss": 0.3804, + "step": 29460 + }, + { + "epoch": 0.6547724848915748, + "grad_norm": 2.578125, + "learning_rate": 6.9026666666666674e-06, + "loss": 0.3855, + "step": 29470 + }, + { + "epoch": 0.6549946676146463, + "grad_norm": 2.15625, + "learning_rate": 6.898222222222223e-06, + "loss": 0.3665, + "step": 29480 + }, + { + "epoch": 0.6552168503377177, + "grad_norm": 2.40625, + "learning_rate": 6.893777777777778e-06, + "loss": 0.3597, + "step": 29490 + }, + { + "epoch": 0.6554390330607892, + "grad_norm": 3.25, + "learning_rate": 6.889333333333334e-06, + "loss": 0.358, + "step": 29500 + }, + { + "epoch": 0.6556612157838606, + "grad_norm": 2.265625, + "learning_rate": 6.88488888888889e-06, + "loss": 0.3633, + "step": 29510 + }, + { + "epoch": 0.6558833985069321, + "grad_norm": 2.734375, + "learning_rate": 6.880444444444445e-06, + "loss": 0.3802, + "step": 29520 + }, + { + "epoch": 0.6561055812300035, + "grad_norm": 2.5625, + "learning_rate": 6.876000000000001e-06, + "loss": 0.3688, + "step": 29530 + }, + { + "epoch": 0.656327763953075, + "grad_norm": 2.234375, + "learning_rate": 6.871555555555556e-06, + "loss": 0.3919, + "step": 29540 + }, + { + "epoch": 0.6565499466761464, + "grad_norm": 2.375, + "learning_rate": 6.867111111111112e-06, + "loss": 0.394, + "step": 29550 + }, + { + "epoch": 0.6567721293992179, + "grad_norm": 2.375, + "learning_rate": 6.862666666666668e-06, + "loss": 0.3783, + "step": 29560 + }, + { + "epoch": 0.6569943121222894, + "grad_norm": 2.375, + "learning_rate": 6.858222222222223e-06, + "loss": 0.3726, + "step": 29570 + }, + { + "epoch": 0.6572164948453608, + "grad_norm": 2.296875, + "learning_rate": 6.853777777777778e-06, + "loss": 0.3872, + "step": 29580 + }, + { + "epoch": 0.6574386775684323, + "grad_norm": 2.5, + "learning_rate": 6.849333333333333e-06, + "loss": 0.3658, + "step": 29590 + }, + { + "epoch": 0.6576608602915037, + "grad_norm": 2.484375, + "learning_rate": 6.84488888888889e-06, + "loss": 0.3468, + "step": 29600 + }, + { + "epoch": 0.6578830430145752, + "grad_norm": 2.625, + "learning_rate": 6.8404444444444454e-06, + "loss": 0.4079, + "step": 29610 + }, + { + "epoch": 0.6581052257376466, + "grad_norm": 2.609375, + "learning_rate": 6.836e-06, + "loss": 0.3706, + "step": 29620 + }, + { + "epoch": 0.6583274084607181, + "grad_norm": 2.453125, + "learning_rate": 6.831555555555556e-06, + "loss": 0.3939, + "step": 29630 + }, + { + "epoch": 0.6585495911837895, + "grad_norm": 2.859375, + "learning_rate": 6.8271111111111125e-06, + "loss": 0.4351, + "step": 29640 + }, + { + "epoch": 0.658771773906861, + "grad_norm": 2.46875, + "learning_rate": 6.822666666666667e-06, + "loss": 0.4096, + "step": 29650 + }, + { + "epoch": 0.6589939566299324, + "grad_norm": 2.15625, + "learning_rate": 6.818222222222223e-06, + "loss": 0.3693, + "step": 29660 + }, + { + "epoch": 0.6592161393530039, + "grad_norm": 2.171875, + "learning_rate": 6.813777777777778e-06, + "loss": 0.3968, + "step": 29670 + }, + { + "epoch": 0.6594383220760753, + "grad_norm": 2.265625, + "learning_rate": 6.809333333333334e-06, + "loss": 0.3775, + "step": 29680 + }, + { + "epoch": 0.6596605047991468, + "grad_norm": 2.296875, + "learning_rate": 6.80488888888889e-06, + "loss": 0.3893, + "step": 29690 + }, + { + "epoch": 0.6598826875222182, + "grad_norm": 2.734375, + "learning_rate": 6.800444444444445e-06, + "loss": 0.3933, + "step": 29700 + }, + { + "epoch": 0.6601048702452897, + "grad_norm": 2.40625, + "learning_rate": 6.796000000000001e-06, + "loss": 0.412, + "step": 29710 + }, + { + "epoch": 0.6603270529683611, + "grad_norm": 2.3125, + "learning_rate": 6.7915555555555555e-06, + "loss": 0.3521, + "step": 29720 + }, + { + "epoch": 0.6605492356914326, + "grad_norm": 2.1875, + "learning_rate": 6.787111111111112e-06, + "loss": 0.3412, + "step": 29730 + }, + { + "epoch": 0.660771418414504, + "grad_norm": 2.453125, + "learning_rate": 6.782666666666668e-06, + "loss": 0.367, + "step": 29740 + }, + { + "epoch": 0.6609936011375755, + "grad_norm": 2.359375, + "learning_rate": 6.778222222222223e-06, + "loss": 0.3539, + "step": 29750 + }, + { + "epoch": 0.6612157838606469, + "grad_norm": 2.171875, + "learning_rate": 6.773777777777778e-06, + "loss": 0.3605, + "step": 29760 + }, + { + "epoch": 0.6614379665837185, + "grad_norm": 2.15625, + "learning_rate": 6.769333333333335e-06, + "loss": 0.3604, + "step": 29770 + }, + { + "epoch": 0.66166014930679, + "grad_norm": 2.265625, + "learning_rate": 6.76488888888889e-06, + "loss": 0.3803, + "step": 29780 + }, + { + "epoch": 0.6618823320298614, + "grad_norm": 2.65625, + "learning_rate": 6.760444444444445e-06, + "loss": 0.4003, + "step": 29790 + }, + { + "epoch": 0.6621045147529329, + "grad_norm": 2.03125, + "learning_rate": 6.756e-06, + "loss": 0.3609, + "step": 29800 + }, + { + "epoch": 0.6623266974760043, + "grad_norm": 2.1875, + "learning_rate": 6.751555555555556e-06, + "loss": 0.3755, + "step": 29810 + }, + { + "epoch": 0.6625488801990758, + "grad_norm": 2.484375, + "learning_rate": 6.7471111111111124e-06, + "loss": 0.3864, + "step": 29820 + }, + { + "epoch": 0.6627710629221472, + "grad_norm": 2.625, + "learning_rate": 6.742666666666667e-06, + "loss": 0.3745, + "step": 29830 + }, + { + "epoch": 0.6629932456452187, + "grad_norm": 2.125, + "learning_rate": 6.738222222222223e-06, + "loss": 0.3283, + "step": 29840 + }, + { + "epoch": 0.6632154283682901, + "grad_norm": 2.703125, + "learning_rate": 6.733777777777778e-06, + "loss": 0.3893, + "step": 29850 + }, + { + "epoch": 0.6634376110913616, + "grad_norm": 2.21875, + "learning_rate": 6.729333333333334e-06, + "loss": 0.3899, + "step": 29860 + }, + { + "epoch": 0.663659793814433, + "grad_norm": 3.125, + "learning_rate": 6.72488888888889e-06, + "loss": 0.3545, + "step": 29870 + }, + { + "epoch": 0.6638819765375045, + "grad_norm": 2.421875, + "learning_rate": 6.720444444444445e-06, + "loss": 0.357, + "step": 29880 + }, + { + "epoch": 0.6641041592605759, + "grad_norm": 2.40625, + "learning_rate": 6.716000000000001e-06, + "loss": 0.369, + "step": 29890 + }, + { + "epoch": 0.6643263419836474, + "grad_norm": 2.515625, + "learning_rate": 6.7115555555555554e-06, + "loss": 0.4066, + "step": 29900 + }, + { + "epoch": 0.6645485247067188, + "grad_norm": 2.375, + "learning_rate": 6.707111111111112e-06, + "loss": 0.3894, + "step": 29910 + }, + { + "epoch": 0.6647707074297903, + "grad_norm": 2.703125, + "learning_rate": 6.702666666666668e-06, + "loss": 0.3918, + "step": 29920 + }, + { + "epoch": 0.6649928901528617, + "grad_norm": 2.609375, + "learning_rate": 6.6982222222222225e-06, + "loss": 0.402, + "step": 29930 + }, + { + "epoch": 0.6652150728759332, + "grad_norm": 3.09375, + "learning_rate": 6.693777777777778e-06, + "loss": 0.3942, + "step": 29940 + }, + { + "epoch": 0.6654372555990046, + "grad_norm": 2.234375, + "learning_rate": 6.689333333333335e-06, + "loss": 0.3799, + "step": 29950 + }, + { + "epoch": 0.6656594383220761, + "grad_norm": 2.375, + "learning_rate": 6.68488888888889e-06, + "loss": 0.4013, + "step": 29960 + }, + { + "epoch": 0.6658816210451475, + "grad_norm": 3.0625, + "learning_rate": 6.680444444444445e-06, + "loss": 0.406, + "step": 29970 + }, + { + "epoch": 0.666103803768219, + "grad_norm": 2.34375, + "learning_rate": 6.676e-06, + "loss": 0.3626, + "step": 29980 + }, + { + "epoch": 0.6663259864912905, + "grad_norm": 2.890625, + "learning_rate": 6.671555555555556e-06, + "loss": 0.4231, + "step": 29990 + }, + { + "epoch": 0.6665481692143619, + "grad_norm": 2.359375, + "learning_rate": 6.667111111111112e-06, + "loss": 0.4099, + "step": 30000 + }, + { + "epoch": 0.6667703519374334, + "grad_norm": 2.609375, + "learning_rate": 6.662666666666667e-06, + "loss": 0.3699, + "step": 30010 + }, + { + "epoch": 0.6669925346605048, + "grad_norm": 2.640625, + "learning_rate": 6.658222222222223e-06, + "loss": 0.3677, + "step": 30020 + }, + { + "epoch": 0.6672147173835763, + "grad_norm": 2.4375, + "learning_rate": 6.653777777777778e-06, + "loss": 0.3877, + "step": 30030 + }, + { + "epoch": 0.6674369001066477, + "grad_norm": 2.859375, + "learning_rate": 6.649333333333334e-06, + "loss": 0.3941, + "step": 30040 + }, + { + "epoch": 0.6676590828297192, + "grad_norm": 2.3125, + "learning_rate": 6.64488888888889e-06, + "loss": 0.3895, + "step": 30050 + }, + { + "epoch": 0.6678812655527906, + "grad_norm": 2.671875, + "learning_rate": 6.640444444444445e-06, + "loss": 0.4168, + "step": 30060 + }, + { + "epoch": 0.6681034482758621, + "grad_norm": 2.984375, + "learning_rate": 6.6360000000000005e-06, + "loss": 0.3704, + "step": 30070 + }, + { + "epoch": 0.6683256309989335, + "grad_norm": 2.921875, + "learning_rate": 6.631555555555557e-06, + "loss": 0.3571, + "step": 30080 + }, + { + "epoch": 0.668547813722005, + "grad_norm": 2.828125, + "learning_rate": 6.627111111111112e-06, + "loss": 0.3695, + "step": 30090 + }, + { + "epoch": 0.6687699964450764, + "grad_norm": 2.515625, + "learning_rate": 6.622666666666668e-06, + "loss": 0.3893, + "step": 30100 + }, + { + "epoch": 0.6689921791681479, + "grad_norm": 2.5, + "learning_rate": 6.618222222222222e-06, + "loss": 0.3676, + "step": 30110 + }, + { + "epoch": 0.6692143618912193, + "grad_norm": 2.84375, + "learning_rate": 6.613777777777778e-06, + "loss": 0.3457, + "step": 30120 + }, + { + "epoch": 0.6694365446142908, + "grad_norm": 2.453125, + "learning_rate": 6.609333333333335e-06, + "loss": 0.388, + "step": 30130 + }, + { + "epoch": 0.6696587273373622, + "grad_norm": 2.328125, + "learning_rate": 6.6048888888888895e-06, + "loss": 0.3966, + "step": 30140 + }, + { + "epoch": 0.6698809100604337, + "grad_norm": 2.609375, + "learning_rate": 6.600444444444445e-06, + "loss": 0.3633, + "step": 30150 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 3.234375, + "learning_rate": 6.596e-06, + "loss": 0.3536, + "step": 30160 + }, + { + "epoch": 0.6703252755065766, + "grad_norm": 2.359375, + "learning_rate": 6.5915555555555566e-06, + "loss": 0.3891, + "step": 30170 + }, + { + "epoch": 0.670547458229648, + "grad_norm": 2.34375, + "learning_rate": 6.587111111111112e-06, + "loss": 0.379, + "step": 30180 + }, + { + "epoch": 0.6707696409527195, + "grad_norm": 2.328125, + "learning_rate": 6.582666666666667e-06, + "loss": 0.379, + "step": 30190 + }, + { + "epoch": 0.670991823675791, + "grad_norm": 2.640625, + "learning_rate": 6.578222222222223e-06, + "loss": 0.3898, + "step": 30200 + }, + { + "epoch": 0.6712140063988624, + "grad_norm": 2.671875, + "learning_rate": 6.573777777777778e-06, + "loss": 0.3695, + "step": 30210 + }, + { + "epoch": 0.6714361891219339, + "grad_norm": 2.875, + "learning_rate": 6.569333333333334e-06, + "loss": 0.3822, + "step": 30220 + }, + { + "epoch": 0.6716583718450053, + "grad_norm": 2.140625, + "learning_rate": 6.56488888888889e-06, + "loss": 0.3455, + "step": 30230 + }, + { + "epoch": 0.6718805545680768, + "grad_norm": 2.390625, + "learning_rate": 6.560444444444445e-06, + "loss": 0.3591, + "step": 30240 + }, + { + "epoch": 0.6721027372911482, + "grad_norm": 2.75, + "learning_rate": 6.556e-06, + "loss": 0.3789, + "step": 30250 + }, + { + "epoch": 0.6723249200142197, + "grad_norm": 2.53125, + "learning_rate": 6.551555555555557e-06, + "loss": 0.3421, + "step": 30260 + }, + { + "epoch": 0.6725471027372911, + "grad_norm": 2.5625, + "learning_rate": 6.547111111111112e-06, + "loss": 0.3677, + "step": 30270 + }, + { + "epoch": 0.6727692854603626, + "grad_norm": 2.984375, + "learning_rate": 6.5426666666666675e-06, + "loss": 0.3675, + "step": 30280 + }, + { + "epoch": 0.672991468183434, + "grad_norm": 2.375, + "learning_rate": 6.538222222222222e-06, + "loss": 0.3698, + "step": 30290 + }, + { + "epoch": 0.6732136509065055, + "grad_norm": 2.828125, + "learning_rate": 6.533777777777778e-06, + "loss": 0.3691, + "step": 30300 + }, + { + "epoch": 0.6734358336295769, + "grad_norm": 2.375, + "learning_rate": 6.5293333333333346e-06, + "loss": 0.3903, + "step": 30310 + }, + { + "epoch": 0.6736580163526484, + "grad_norm": 2.46875, + "learning_rate": 6.524888888888889e-06, + "loss": 0.3781, + "step": 30320 + }, + { + "epoch": 0.6738801990757198, + "grad_norm": 2.53125, + "learning_rate": 6.520444444444445e-06, + "loss": 0.3968, + "step": 30330 + }, + { + "epoch": 0.6741023817987913, + "grad_norm": 2.3125, + "learning_rate": 6.516e-06, + "loss": 0.3784, + "step": 30340 + }, + { + "epoch": 0.6743245645218627, + "grad_norm": 2.28125, + "learning_rate": 6.5115555555555565e-06, + "loss": 0.3678, + "step": 30350 + }, + { + "epoch": 0.6745467472449342, + "grad_norm": 2.375, + "learning_rate": 6.507111111111112e-06, + "loss": 0.3828, + "step": 30360 + }, + { + "epoch": 0.6747689299680056, + "grad_norm": 2.296875, + "learning_rate": 6.502666666666667e-06, + "loss": 0.3999, + "step": 30370 + }, + { + "epoch": 0.6749911126910771, + "grad_norm": 2.171875, + "learning_rate": 6.498222222222223e-06, + "loss": 0.3861, + "step": 30380 + }, + { + "epoch": 0.6752132954141487, + "grad_norm": 2.578125, + "learning_rate": 6.493777777777779e-06, + "loss": 0.3735, + "step": 30390 + }, + { + "epoch": 0.67543547813722, + "grad_norm": 2.59375, + "learning_rate": 6.489333333333334e-06, + "loss": 0.3889, + "step": 30400 + }, + { + "epoch": 0.6756576608602916, + "grad_norm": 2.734375, + "learning_rate": 6.48488888888889e-06, + "loss": 0.3923, + "step": 30410 + }, + { + "epoch": 0.675879843583363, + "grad_norm": 2.5625, + "learning_rate": 6.480444444444445e-06, + "loss": 0.3722, + "step": 30420 + }, + { + "epoch": 0.6761020263064345, + "grad_norm": 2.234375, + "learning_rate": 6.476e-06, + "loss": 0.4002, + "step": 30430 + }, + { + "epoch": 0.6763242090295059, + "grad_norm": 2.828125, + "learning_rate": 6.471555555555557e-06, + "loss": 0.4155, + "step": 30440 + }, + { + "epoch": 0.6765463917525774, + "grad_norm": 2.421875, + "learning_rate": 6.467111111111112e-06, + "loss": 0.4074, + "step": 30450 + }, + { + "epoch": 0.6767685744756488, + "grad_norm": 2.765625, + "learning_rate": 6.462666666666667e-06, + "loss": 0.3554, + "step": 30460 + }, + { + "epoch": 0.6769907571987203, + "grad_norm": 2.234375, + "learning_rate": 6.458222222222222e-06, + "loss": 0.3737, + "step": 30470 + }, + { + "epoch": 0.6772129399217917, + "grad_norm": 2.34375, + "learning_rate": 6.453777777777779e-06, + "loss": 0.3717, + "step": 30480 + }, + { + "epoch": 0.6774351226448632, + "grad_norm": 2.21875, + "learning_rate": 6.4493333333333345e-06, + "loss": 0.3333, + "step": 30490 + }, + { + "epoch": 0.6776573053679346, + "grad_norm": 2.1875, + "learning_rate": 6.444888888888889e-06, + "loss": 0.405, + "step": 30500 + }, + { + "epoch": 0.6778794880910061, + "grad_norm": 2.234375, + "learning_rate": 6.440444444444445e-06, + "loss": 0.3492, + "step": 30510 + }, + { + "epoch": 0.6781016708140775, + "grad_norm": 2.796875, + "learning_rate": 6.436e-06, + "loss": 0.4077, + "step": 30520 + }, + { + "epoch": 0.678323853537149, + "grad_norm": 2.765625, + "learning_rate": 6.431555555555556e-06, + "loss": 0.3826, + "step": 30530 + }, + { + "epoch": 0.6785460362602204, + "grad_norm": 2.578125, + "learning_rate": 6.427111111111112e-06, + "loss": 0.3546, + "step": 30540 + }, + { + "epoch": 0.6787682189832919, + "grad_norm": 2.578125, + "learning_rate": 6.422666666666667e-06, + "loss": 0.3804, + "step": 30550 + }, + { + "epoch": 0.6789904017063633, + "grad_norm": 2.546875, + "learning_rate": 6.418222222222223e-06, + "loss": 0.4225, + "step": 30560 + }, + { + "epoch": 0.6792125844294348, + "grad_norm": 2.96875, + "learning_rate": 6.413777777777779e-06, + "loss": 0.3724, + "step": 30570 + }, + { + "epoch": 0.6794347671525062, + "grad_norm": 2.28125, + "learning_rate": 6.409333333333334e-06, + "loss": 0.3994, + "step": 30580 + }, + { + "epoch": 0.6796569498755777, + "grad_norm": 2.03125, + "learning_rate": 6.40488888888889e-06, + "loss": 0.3607, + "step": 30590 + }, + { + "epoch": 0.6798791325986492, + "grad_norm": 1.921875, + "learning_rate": 6.4004444444444446e-06, + "loss": 0.393, + "step": 30600 + }, + { + "epoch": 0.6801013153217206, + "grad_norm": 2.296875, + "learning_rate": 6.396e-06, + "loss": 0.3754, + "step": 30610 + }, + { + "epoch": 0.6803234980447921, + "grad_norm": 2.765625, + "learning_rate": 6.391555555555557e-06, + "loss": 0.4055, + "step": 30620 + }, + { + "epoch": 0.6805456807678635, + "grad_norm": 2.359375, + "learning_rate": 6.387111111111112e-06, + "loss": 0.3717, + "step": 30630 + }, + { + "epoch": 0.680767863490935, + "grad_norm": 2.375, + "learning_rate": 6.382666666666667e-06, + "loss": 0.3224, + "step": 30640 + }, + { + "epoch": 0.6809900462140064, + "grad_norm": 2.5625, + "learning_rate": 6.378222222222222e-06, + "loss": 0.3843, + "step": 30650 + }, + { + "epoch": 0.6812122289370779, + "grad_norm": 2.0, + "learning_rate": 6.373777777777779e-06, + "loss": 0.4126, + "step": 30660 + }, + { + "epoch": 0.6814344116601493, + "grad_norm": 2.375, + "learning_rate": 6.369333333333334e-06, + "loss": 0.3901, + "step": 30670 + }, + { + "epoch": 0.6816565943832208, + "grad_norm": 2.78125, + "learning_rate": 6.364888888888889e-06, + "loss": 0.3607, + "step": 30680 + }, + { + "epoch": 0.6818787771062922, + "grad_norm": 2.796875, + "learning_rate": 6.360444444444445e-06, + "loss": 0.3835, + "step": 30690 + }, + { + "epoch": 0.6821009598293637, + "grad_norm": 2.515625, + "learning_rate": 6.356000000000001e-06, + "loss": 0.3999, + "step": 30700 + }, + { + "epoch": 0.6823231425524351, + "grad_norm": 2.828125, + "learning_rate": 6.351555555555556e-06, + "loss": 0.3683, + "step": 30710 + }, + { + "epoch": 0.6825453252755066, + "grad_norm": 3.421875, + "learning_rate": 6.347111111111112e-06, + "loss": 0.3753, + "step": 30720 + }, + { + "epoch": 0.682767507998578, + "grad_norm": 2.28125, + "learning_rate": 6.342666666666667e-06, + "loss": 0.375, + "step": 30730 + }, + { + "epoch": 0.6829896907216495, + "grad_norm": 2.390625, + "learning_rate": 6.3382222222222226e-06, + "loss": 0.3567, + "step": 30740 + }, + { + "epoch": 0.6832118734447209, + "grad_norm": 2.625, + "learning_rate": 6.333777777777779e-06, + "loss": 0.3455, + "step": 30750 + }, + { + "epoch": 0.6834340561677924, + "grad_norm": 2.3125, + "learning_rate": 6.329333333333334e-06, + "loss": 0.3536, + "step": 30760 + }, + { + "epoch": 0.6836562388908638, + "grad_norm": 2.6875, + "learning_rate": 6.32488888888889e-06, + "loss": 0.3903, + "step": 30770 + }, + { + "epoch": 0.6838784216139353, + "grad_norm": 2.625, + "learning_rate": 6.3204444444444445e-06, + "loss": 0.402, + "step": 30780 + }, + { + "epoch": 0.6841006043370067, + "grad_norm": 2.828125, + "learning_rate": 6.316000000000001e-06, + "loss": 0.3951, + "step": 30790 + }, + { + "epoch": 0.6843227870600782, + "grad_norm": 2.265625, + "learning_rate": 6.311555555555557e-06, + "loss": 0.3408, + "step": 30800 + }, + { + "epoch": 0.6845449697831497, + "grad_norm": 2.328125, + "learning_rate": 6.3071111111111116e-06, + "loss": 0.3605, + "step": 30810 + }, + { + "epoch": 0.6847671525062211, + "grad_norm": 2.171875, + "learning_rate": 6.302666666666667e-06, + "loss": 0.4112, + "step": 30820 + }, + { + "epoch": 0.6849893352292926, + "grad_norm": 2.453125, + "learning_rate": 6.298222222222222e-06, + "loss": 0.379, + "step": 30830 + }, + { + "epoch": 0.685211517952364, + "grad_norm": 3.140625, + "learning_rate": 6.293777777777779e-06, + "loss": 0.4127, + "step": 30840 + }, + { + "epoch": 0.6854337006754355, + "grad_norm": 2.96875, + "learning_rate": 6.289333333333334e-06, + "loss": 0.4008, + "step": 30850 + }, + { + "epoch": 0.6856558833985069, + "grad_norm": 2.28125, + "learning_rate": 6.284888888888889e-06, + "loss": 0.4076, + "step": 30860 + }, + { + "epoch": 0.6858780661215784, + "grad_norm": 3.203125, + "learning_rate": 6.280444444444445e-06, + "loss": 0.3614, + "step": 30870 + }, + { + "epoch": 0.6861002488446498, + "grad_norm": 2.265625, + "learning_rate": 6.2760000000000006e-06, + "loss": 0.357, + "step": 30880 + }, + { + "epoch": 0.6863224315677213, + "grad_norm": 2.390625, + "learning_rate": 6.271555555555556e-06, + "loss": 0.3925, + "step": 30890 + }, + { + "epoch": 0.6865446142907927, + "grad_norm": 2.59375, + "learning_rate": 6.267111111111112e-06, + "loss": 0.393, + "step": 30900 + }, + { + "epoch": 0.6867667970138642, + "grad_norm": 2.171875, + "learning_rate": 6.262666666666667e-06, + "loss": 0.4081, + "step": 30910 + }, + { + "epoch": 0.6869889797369356, + "grad_norm": 2.125, + "learning_rate": 6.2582222222222225e-06, + "loss": 0.4138, + "step": 30920 + }, + { + "epoch": 0.6872111624600071, + "grad_norm": 2.609375, + "learning_rate": 6.253777777777779e-06, + "loss": 0.38, + "step": 30930 + }, + { + "epoch": 0.6874333451830785, + "grad_norm": 3.125, + "learning_rate": 6.249333333333334e-06, + "loss": 0.397, + "step": 30940 + }, + { + "epoch": 0.68765552790615, + "grad_norm": 2.328125, + "learning_rate": 6.2448888888888896e-06, + "loss": 0.4009, + "step": 30950 + }, + { + "epoch": 0.6878777106292214, + "grad_norm": 3.0625, + "learning_rate": 6.240444444444444e-06, + "loss": 0.3613, + "step": 30960 + }, + { + "epoch": 0.6880998933522929, + "grad_norm": 2.546875, + "learning_rate": 6.236000000000001e-06, + "loss": 0.4026, + "step": 30970 + }, + { + "epoch": 0.6883220760753643, + "grad_norm": 2.40625, + "learning_rate": 6.231555555555557e-06, + "loss": 0.3755, + "step": 30980 + }, + { + "epoch": 0.6885442587984358, + "grad_norm": 2.328125, + "learning_rate": 6.2271111111111115e-06, + "loss": 0.4011, + "step": 30990 + }, + { + "epoch": 0.6887664415215072, + "grad_norm": 2.59375, + "learning_rate": 6.222666666666667e-06, + "loss": 0.3851, + "step": 31000 + }, + { + "epoch": 0.6889886242445787, + "grad_norm": 2.53125, + "learning_rate": 6.218222222222223e-06, + "loss": 0.3872, + "step": 31010 + }, + { + "epoch": 0.6892108069676502, + "grad_norm": 2.3125, + "learning_rate": 6.2137777777777786e-06, + "loss": 0.3654, + "step": 31020 + }, + { + "epoch": 0.6894329896907216, + "grad_norm": 2.25, + "learning_rate": 6.209333333333334e-06, + "loss": 0.3944, + "step": 31030 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.8125, + "learning_rate": 6.204888888888889e-06, + "loss": 0.3752, + "step": 31040 + }, + { + "epoch": 0.6898773551368645, + "grad_norm": 2.671875, + "learning_rate": 6.200444444444445e-06, + "loss": 0.3726, + "step": 31050 + }, + { + "epoch": 0.690099537859936, + "grad_norm": 2.515625, + "learning_rate": 6.196000000000001e-06, + "loss": 0.3663, + "step": 31060 + }, + { + "epoch": 0.6903217205830074, + "grad_norm": 2.140625, + "learning_rate": 6.191555555555556e-06, + "loss": 0.3583, + "step": 31070 + }, + { + "epoch": 0.690543903306079, + "grad_norm": 2.265625, + "learning_rate": 6.187111111111112e-06, + "loss": 0.3539, + "step": 31080 + }, + { + "epoch": 0.6907660860291504, + "grad_norm": 2.59375, + "learning_rate": 6.182666666666667e-06, + "loss": 0.4004, + "step": 31090 + }, + { + "epoch": 0.6909882687522219, + "grad_norm": 2.703125, + "learning_rate": 6.178222222222223e-06, + "loss": 0.3762, + "step": 31100 + }, + { + "epoch": 0.6912104514752933, + "grad_norm": 2.046875, + "learning_rate": 6.173777777777779e-06, + "loss": 0.3915, + "step": 31110 + }, + { + "epoch": 0.6914326341983648, + "grad_norm": 2.84375, + "learning_rate": 6.169333333333334e-06, + "loss": 0.4161, + "step": 31120 + }, + { + "epoch": 0.6916548169214362, + "grad_norm": 1.9453125, + "learning_rate": 6.1648888888888895e-06, + "loss": 0.333, + "step": 31130 + }, + { + "epoch": 0.6918769996445077, + "grad_norm": 2.265625, + "learning_rate": 6.160444444444444e-06, + "loss": 0.3612, + "step": 31140 + }, + { + "epoch": 0.6920991823675791, + "grad_norm": 2.234375, + "learning_rate": 6.156000000000001e-06, + "loss": 0.3741, + "step": 31150 + }, + { + "epoch": 0.6923213650906506, + "grad_norm": 2.203125, + "learning_rate": 6.1515555555555566e-06, + "loss": 0.3687, + "step": 31160 + }, + { + "epoch": 0.692543547813722, + "grad_norm": 2.71875, + "learning_rate": 6.147111111111111e-06, + "loss": 0.4184, + "step": 31170 + }, + { + "epoch": 0.6927657305367935, + "grad_norm": 2.0625, + "learning_rate": 6.142666666666667e-06, + "loss": 0.3746, + "step": 31180 + }, + { + "epoch": 0.6929879132598649, + "grad_norm": 2.890625, + "learning_rate": 6.138222222222223e-06, + "loss": 0.4053, + "step": 31190 + }, + { + "epoch": 0.6932100959829364, + "grad_norm": 1.984375, + "learning_rate": 6.1337777777777785e-06, + "loss": 0.3382, + "step": 31200 + }, + { + "epoch": 0.6934322787060079, + "grad_norm": 2.609375, + "learning_rate": 6.129333333333334e-06, + "loss": 0.4028, + "step": 31210 + }, + { + "epoch": 0.6936544614290793, + "grad_norm": 2.6875, + "learning_rate": 6.124888888888889e-06, + "loss": 0.4148, + "step": 31220 + }, + { + "epoch": 0.6938766441521508, + "grad_norm": 2.296875, + "learning_rate": 6.120444444444445e-06, + "loss": 0.3841, + "step": 31230 + }, + { + "epoch": 0.6940988268752222, + "grad_norm": 2.765625, + "learning_rate": 6.116000000000001e-06, + "loss": 0.4052, + "step": 31240 + }, + { + "epoch": 0.6943210095982937, + "grad_norm": 3.359375, + "learning_rate": 6.111555555555556e-06, + "loss": 0.3667, + "step": 31250 + }, + { + "epoch": 0.6945431923213651, + "grad_norm": 2.546875, + "learning_rate": 6.107111111111112e-06, + "loss": 0.3724, + "step": 31260 + }, + { + "epoch": 0.6947653750444366, + "grad_norm": 2.515625, + "learning_rate": 6.102666666666667e-06, + "loss": 0.3665, + "step": 31270 + }, + { + "epoch": 0.694987557767508, + "grad_norm": 2.71875, + "learning_rate": 6.098222222222223e-06, + "loss": 0.3945, + "step": 31280 + }, + { + "epoch": 0.6952097404905795, + "grad_norm": 2.75, + "learning_rate": 6.093777777777779e-06, + "loss": 0.3324, + "step": 31290 + }, + { + "epoch": 0.6954319232136509, + "grad_norm": 2.546875, + "learning_rate": 6.089333333333334e-06, + "loss": 0.3721, + "step": 31300 + }, + { + "epoch": 0.6956541059367224, + "grad_norm": 2.375, + "learning_rate": 6.084888888888889e-06, + "loss": 0.3802, + "step": 31310 + }, + { + "epoch": 0.6958762886597938, + "grad_norm": 2.328125, + "learning_rate": 6.080444444444445e-06, + "loss": 0.3598, + "step": 31320 + }, + { + "epoch": 0.6960984713828653, + "grad_norm": 2.375, + "learning_rate": 6.076000000000001e-06, + "loss": 0.3883, + "step": 31330 + }, + { + "epoch": 0.6963206541059367, + "grad_norm": 2.078125, + "learning_rate": 6.0715555555555565e-06, + "loss": 0.3485, + "step": 31340 + }, + { + "epoch": 0.6965428368290082, + "grad_norm": 2.53125, + "learning_rate": 6.067111111111111e-06, + "loss": 0.4005, + "step": 31350 + }, + { + "epoch": 0.6967650195520796, + "grad_norm": 2.4375, + "learning_rate": 6.062666666666667e-06, + "loss": 0.3968, + "step": 31360 + }, + { + "epoch": 0.6969872022751511, + "grad_norm": 2.25, + "learning_rate": 6.058222222222223e-06, + "loss": 0.3712, + "step": 31370 + }, + { + "epoch": 0.6972093849982225, + "grad_norm": 2.5, + "learning_rate": 6.053777777777778e-06, + "loss": 0.4079, + "step": 31380 + }, + { + "epoch": 0.697431567721294, + "grad_norm": 2.3125, + "learning_rate": 6.049333333333334e-06, + "loss": 0.369, + "step": 31390 + }, + { + "epoch": 0.6976537504443654, + "grad_norm": 2.546875, + "learning_rate": 6.044888888888889e-06, + "loss": 0.3667, + "step": 31400 + }, + { + "epoch": 0.6978759331674369, + "grad_norm": 2.171875, + "learning_rate": 6.0404444444444455e-06, + "loss": 0.3936, + "step": 31410 + }, + { + "epoch": 0.6980981158905084, + "grad_norm": 3.046875, + "learning_rate": 6.036000000000001e-06, + "loss": 0.415, + "step": 31420 + }, + { + "epoch": 0.6983202986135798, + "grad_norm": 2.421875, + "learning_rate": 6.031555555555556e-06, + "loss": 0.3716, + "step": 31430 + }, + { + "epoch": 0.6985424813366513, + "grad_norm": 2.515625, + "learning_rate": 6.027111111111112e-06, + "loss": 0.3485, + "step": 31440 + }, + { + "epoch": 0.6987646640597227, + "grad_norm": 2.15625, + "learning_rate": 6.0226666666666665e-06, + "loss": 0.3572, + "step": 31450 + }, + { + "epoch": 0.6989868467827942, + "grad_norm": 2.671875, + "learning_rate": 6.018222222222223e-06, + "loss": 0.3455, + "step": 31460 + }, + { + "epoch": 0.6992090295058656, + "grad_norm": 2.46875, + "learning_rate": 6.013777777777779e-06, + "loss": 0.38, + "step": 31470 + }, + { + "epoch": 0.6994312122289371, + "grad_norm": 2.9375, + "learning_rate": 6.009333333333334e-06, + "loss": 0.3581, + "step": 31480 + }, + { + "epoch": 0.6996533949520085, + "grad_norm": 2.265625, + "learning_rate": 6.004888888888889e-06, + "loss": 0.4038, + "step": 31490 + }, + { + "epoch": 0.69987557767508, + "grad_norm": 2.4375, + "learning_rate": 6.000444444444445e-06, + "loss": 0.3667, + "step": 31500 + }, + { + "epoch": 0.7000977603981514, + "grad_norm": 2.34375, + "learning_rate": 5.996000000000001e-06, + "loss": 0.3834, + "step": 31510 + }, + { + "epoch": 0.7003199431212229, + "grad_norm": 2.546875, + "learning_rate": 5.991555555555556e-06, + "loss": 0.364, + "step": 31520 + }, + { + "epoch": 0.7005421258442943, + "grad_norm": 2.671875, + "learning_rate": 5.987111111111111e-06, + "loss": 0.4001, + "step": 31530 + }, + { + "epoch": 0.7007643085673658, + "grad_norm": 2.9375, + "learning_rate": 5.982666666666667e-06, + "loss": 0.3787, + "step": 31540 + }, + { + "epoch": 0.7009864912904372, + "grad_norm": 2.90625, + "learning_rate": 5.978222222222223e-06, + "loss": 0.3874, + "step": 31550 + }, + { + "epoch": 0.7012086740135087, + "grad_norm": 2.28125, + "learning_rate": 5.973777777777778e-06, + "loss": 0.3811, + "step": 31560 + }, + { + "epoch": 0.7014308567365801, + "grad_norm": 2.859375, + "learning_rate": 5.969333333333334e-06, + "loss": 0.4122, + "step": 31570 + }, + { + "epoch": 0.7016530394596516, + "grad_norm": 2.328125, + "learning_rate": 5.964888888888889e-06, + "loss": 0.4028, + "step": 31580 + }, + { + "epoch": 0.701875222182723, + "grad_norm": 2.71875, + "learning_rate": 5.960444444444445e-06, + "loss": 0.4028, + "step": 31590 + }, + { + "epoch": 0.7020974049057945, + "grad_norm": 2.25, + "learning_rate": 5.956000000000001e-06, + "loss": 0.3656, + "step": 31600 + }, + { + "epoch": 0.7023195876288659, + "grad_norm": 2.3125, + "learning_rate": 5.951555555555556e-06, + "loss": 0.3823, + "step": 31610 + }, + { + "epoch": 0.7025417703519374, + "grad_norm": 2.328125, + "learning_rate": 5.947111111111112e-06, + "loss": 0.3883, + "step": 31620 + }, + { + "epoch": 0.7027639530750089, + "grad_norm": 2.21875, + "learning_rate": 5.942666666666667e-06, + "loss": 0.3654, + "step": 31630 + }, + { + "epoch": 0.7029861357980803, + "grad_norm": 2.359375, + "learning_rate": 5.938222222222223e-06, + "loss": 0.4223, + "step": 31640 + }, + { + "epoch": 0.7032083185211518, + "grad_norm": 2.375, + "learning_rate": 5.933777777777779e-06, + "loss": 0.3847, + "step": 31650 + }, + { + "epoch": 0.7034305012442232, + "grad_norm": 2.34375, + "learning_rate": 5.9293333333333335e-06, + "loss": 0.3774, + "step": 31660 + }, + { + "epoch": 0.7036526839672947, + "grad_norm": 2.625, + "learning_rate": 5.924888888888889e-06, + "loss": 0.3562, + "step": 31670 + }, + { + "epoch": 0.7038748666903661, + "grad_norm": 2.171875, + "learning_rate": 5.920444444444445e-06, + "loss": 0.3559, + "step": 31680 + }, + { + "epoch": 0.7040970494134376, + "grad_norm": 2.390625, + "learning_rate": 5.916000000000001e-06, + "loss": 0.4212, + "step": 31690 + }, + { + "epoch": 0.704319232136509, + "grad_norm": 2.4375, + "learning_rate": 5.911555555555556e-06, + "loss": 0.3567, + "step": 31700 + }, + { + "epoch": 0.7045414148595806, + "grad_norm": 2.609375, + "learning_rate": 5.907111111111111e-06, + "loss": 0.3667, + "step": 31710 + }, + { + "epoch": 0.704763597582652, + "grad_norm": 2.265625, + "learning_rate": 5.902666666666668e-06, + "loss": 0.3793, + "step": 31720 + }, + { + "epoch": 0.7049857803057235, + "grad_norm": 2.84375, + "learning_rate": 5.8982222222222225e-06, + "loss": 0.3814, + "step": 31730 + }, + { + "epoch": 0.7052079630287948, + "grad_norm": 2.640625, + "learning_rate": 5.893777777777778e-06, + "loss": 0.3853, + "step": 31740 + }, + { + "epoch": 0.7054301457518664, + "grad_norm": 2.546875, + "learning_rate": 5.889333333333334e-06, + "loss": 0.4039, + "step": 31750 + }, + { + "epoch": 0.7056523284749378, + "grad_norm": 2.625, + "learning_rate": 5.884888888888889e-06, + "loss": 0.3638, + "step": 31760 + }, + { + "epoch": 0.7058745111980093, + "grad_norm": 2.75, + "learning_rate": 5.880444444444445e-06, + "loss": 0.433, + "step": 31770 + }, + { + "epoch": 0.7060966939210807, + "grad_norm": 2.703125, + "learning_rate": 5.876000000000001e-06, + "loss": 0.434, + "step": 31780 + }, + { + "epoch": 0.7063188766441522, + "grad_norm": 2.53125, + "learning_rate": 5.871555555555556e-06, + "loss": 0.384, + "step": 31790 + }, + { + "epoch": 0.7065410593672236, + "grad_norm": 2.578125, + "learning_rate": 5.8671111111111115e-06, + "loss": 0.3686, + "step": 31800 + }, + { + "epoch": 0.7067632420902951, + "grad_norm": 2.375, + "learning_rate": 5.862666666666667e-06, + "loss": 0.3816, + "step": 31810 + }, + { + "epoch": 0.7069854248133665, + "grad_norm": 2.171875, + "learning_rate": 5.858222222222223e-06, + "loss": 0.3428, + "step": 31820 + }, + { + "epoch": 0.707207607536438, + "grad_norm": 2.84375, + "learning_rate": 5.853777777777779e-06, + "loss": 0.3623, + "step": 31830 + }, + { + "epoch": 0.7074297902595095, + "grad_norm": 2.453125, + "learning_rate": 5.8493333333333335e-06, + "loss": 0.4127, + "step": 31840 + }, + { + "epoch": 0.7076519729825809, + "grad_norm": 2.71875, + "learning_rate": 5.844888888888889e-06, + "loss": 0.4007, + "step": 31850 + }, + { + "epoch": 0.7078741557056524, + "grad_norm": 2.953125, + "learning_rate": 5.840444444444445e-06, + "loss": 0.4189, + "step": 31860 + }, + { + "epoch": 0.7080963384287238, + "grad_norm": 2.453125, + "learning_rate": 5.8360000000000005e-06, + "loss": 0.3812, + "step": 31870 + }, + { + "epoch": 0.7083185211517953, + "grad_norm": 2.328125, + "learning_rate": 5.831555555555556e-06, + "loss": 0.3905, + "step": 31880 + }, + { + "epoch": 0.7085407038748667, + "grad_norm": 2.390625, + "learning_rate": 5.827111111111111e-06, + "loss": 0.3401, + "step": 31890 + }, + { + "epoch": 0.7087628865979382, + "grad_norm": 2.46875, + "learning_rate": 5.822666666666668e-06, + "loss": 0.3723, + "step": 31900 + }, + { + "epoch": 0.7089850693210096, + "grad_norm": 2.1875, + "learning_rate": 5.818222222222223e-06, + "loss": 0.3983, + "step": 31910 + }, + { + "epoch": 0.7092072520440811, + "grad_norm": 2.65625, + "learning_rate": 5.813777777777778e-06, + "loss": 0.3972, + "step": 31920 + }, + { + "epoch": 0.7094294347671525, + "grad_norm": 2.390625, + "learning_rate": 5.809333333333334e-06, + "loss": 0.3765, + "step": 31930 + }, + { + "epoch": 0.709651617490224, + "grad_norm": 2.234375, + "learning_rate": 5.8048888888888895e-06, + "loss": 0.4079, + "step": 31940 + }, + { + "epoch": 0.7098738002132954, + "grad_norm": 2.609375, + "learning_rate": 5.800444444444445e-06, + "loss": 0.3835, + "step": 31950 + }, + { + "epoch": 0.7100959829363669, + "grad_norm": 2.109375, + "learning_rate": 5.796000000000001e-06, + "loss": 0.3712, + "step": 31960 + }, + { + "epoch": 0.7103181656594383, + "grad_norm": 2.6875, + "learning_rate": 5.791555555555556e-06, + "loss": 0.3595, + "step": 31970 + }, + { + "epoch": 0.7105403483825098, + "grad_norm": 2.734375, + "learning_rate": 5.7871111111111115e-06, + "loss": 0.4171, + "step": 31980 + }, + { + "epoch": 0.7107625311055812, + "grad_norm": 2.5625, + "learning_rate": 5.782666666666667e-06, + "loss": 0.375, + "step": 31990 + }, + { + "epoch": 0.7109847138286527, + "grad_norm": 2.078125, + "learning_rate": 5.778222222222223e-06, + "loss": 0.3762, + "step": 32000 + }, + { + "epoch": 0.7112068965517241, + "grad_norm": 2.390625, + "learning_rate": 5.7737777777777785e-06, + "loss": 0.3487, + "step": 32010 + }, + { + "epoch": 0.7114290792747956, + "grad_norm": 2.59375, + "learning_rate": 5.769333333333333e-06, + "loss": 0.387, + "step": 32020 + }, + { + "epoch": 0.711651261997867, + "grad_norm": 2.28125, + "learning_rate": 5.76488888888889e-06, + "loss": 0.3527, + "step": 32030 + }, + { + "epoch": 0.7118734447209385, + "grad_norm": 2.5, + "learning_rate": 5.760444444444445e-06, + "loss": 0.3559, + "step": 32040 + }, + { + "epoch": 0.71209562744401, + "grad_norm": 2.1875, + "learning_rate": 5.7560000000000005e-06, + "loss": 0.3764, + "step": 32050 + }, + { + "epoch": 0.7123178101670814, + "grad_norm": 2.078125, + "learning_rate": 5.751555555555556e-06, + "loss": 0.4182, + "step": 32060 + }, + { + "epoch": 0.7125399928901529, + "grad_norm": 2.625, + "learning_rate": 5.747111111111111e-06, + "loss": 0.4145, + "step": 32070 + }, + { + "epoch": 0.7127621756132243, + "grad_norm": 2.546875, + "learning_rate": 5.7426666666666675e-06, + "loss": 0.3925, + "step": 32080 + }, + { + "epoch": 0.7129843583362958, + "grad_norm": 2.671875, + "learning_rate": 5.738222222222223e-06, + "loss": 0.3647, + "step": 32090 + }, + { + "epoch": 0.7132065410593672, + "grad_norm": 2.765625, + "learning_rate": 5.733777777777778e-06, + "loss": 0.4259, + "step": 32100 + }, + { + "epoch": 0.7134287237824387, + "grad_norm": 3.03125, + "learning_rate": 5.729333333333334e-06, + "loss": 0.363, + "step": 32110 + }, + { + "epoch": 0.7136509065055101, + "grad_norm": 2.78125, + "learning_rate": 5.7248888888888895e-06, + "loss": 0.385, + "step": 32120 + }, + { + "epoch": 0.7138730892285816, + "grad_norm": 3.0, + "learning_rate": 5.720444444444445e-06, + "loss": 0.3991, + "step": 32130 + }, + { + "epoch": 0.714095271951653, + "grad_norm": 2.9375, + "learning_rate": 5.716000000000001e-06, + "loss": 0.374, + "step": 32140 + }, + { + "epoch": 0.7143174546747245, + "grad_norm": 2.609375, + "learning_rate": 5.711555555555556e-06, + "loss": 0.3876, + "step": 32150 + }, + { + "epoch": 0.7145396373977959, + "grad_norm": 2.59375, + "learning_rate": 5.707111111111111e-06, + "loss": 0.4159, + "step": 32160 + }, + { + "epoch": 0.7147618201208674, + "grad_norm": 2.46875, + "learning_rate": 5.702666666666667e-06, + "loss": 0.3928, + "step": 32170 + }, + { + "epoch": 0.7149840028439388, + "grad_norm": 2.875, + "learning_rate": 5.698222222222223e-06, + "loss": 0.3942, + "step": 32180 + }, + { + "epoch": 0.7152061855670103, + "grad_norm": 2.359375, + "learning_rate": 5.6937777777777785e-06, + "loss": 0.3635, + "step": 32190 + }, + { + "epoch": 0.7154283682900817, + "grad_norm": 2.484375, + "learning_rate": 5.689333333333333e-06, + "loss": 0.3683, + "step": 32200 + }, + { + "epoch": 0.7156505510131532, + "grad_norm": 2.125, + "learning_rate": 5.68488888888889e-06, + "loss": 0.3795, + "step": 32210 + }, + { + "epoch": 0.7158727337362246, + "grad_norm": 2.890625, + "learning_rate": 5.680444444444445e-06, + "loss": 0.3697, + "step": 32220 + }, + { + "epoch": 0.7160949164592961, + "grad_norm": 2.40625, + "learning_rate": 5.676e-06, + "loss": 0.3657, + "step": 32230 + }, + { + "epoch": 0.7163170991823676, + "grad_norm": 2.765625, + "learning_rate": 5.671555555555556e-06, + "loss": 0.3692, + "step": 32240 + }, + { + "epoch": 0.716539281905439, + "grad_norm": 2.515625, + "learning_rate": 5.667111111111112e-06, + "loss": 0.3662, + "step": 32250 + }, + { + "epoch": 0.7167614646285105, + "grad_norm": 2.6875, + "learning_rate": 5.6626666666666675e-06, + "loss": 0.3842, + "step": 32260 + }, + { + "epoch": 0.7169836473515819, + "grad_norm": 2.765625, + "learning_rate": 5.658222222222223e-06, + "loss": 0.3638, + "step": 32270 + }, + { + "epoch": 0.7172058300746534, + "grad_norm": 2.28125, + "learning_rate": 5.653777777777778e-06, + "loss": 0.4022, + "step": 32280 + }, + { + "epoch": 0.7174280127977248, + "grad_norm": 3.25, + "learning_rate": 5.649333333333334e-06, + "loss": 0.3911, + "step": 32290 + }, + { + "epoch": 0.7176501955207963, + "grad_norm": 2.28125, + "learning_rate": 5.644888888888889e-06, + "loss": 0.3572, + "step": 32300 + }, + { + "epoch": 0.7178723782438677, + "grad_norm": 2.5625, + "learning_rate": 5.640444444444445e-06, + "loss": 0.3877, + "step": 32310 + }, + { + "epoch": 0.7180945609669392, + "grad_norm": 2.46875, + "learning_rate": 5.636000000000001e-06, + "loss": 0.4065, + "step": 32320 + }, + { + "epoch": 0.7183167436900106, + "grad_norm": 2.46875, + "learning_rate": 5.631555555555556e-06, + "loss": 0.3379, + "step": 32330 + }, + { + "epoch": 0.7185389264130821, + "grad_norm": 2.140625, + "learning_rate": 5.627111111111112e-06, + "loss": 0.3465, + "step": 32340 + }, + { + "epoch": 0.7187611091361535, + "grad_norm": 2.515625, + "learning_rate": 5.622666666666667e-06, + "loss": 0.4159, + "step": 32350 + }, + { + "epoch": 0.718983291859225, + "grad_norm": 2.703125, + "learning_rate": 5.618222222222223e-06, + "loss": 0.3727, + "step": 32360 + }, + { + "epoch": 0.7192054745822964, + "grad_norm": 2.265625, + "learning_rate": 5.613777777777778e-06, + "loss": 0.3809, + "step": 32370 + }, + { + "epoch": 0.719427657305368, + "grad_norm": 2.453125, + "learning_rate": 5.609333333333333e-06, + "loss": 0.35, + "step": 32380 + }, + { + "epoch": 0.7196498400284393, + "grad_norm": 2.1875, + "learning_rate": 5.60488888888889e-06, + "loss": 0.4121, + "step": 32390 + }, + { + "epoch": 0.7198720227515109, + "grad_norm": 2.546875, + "learning_rate": 5.600444444444445e-06, + "loss": 0.435, + "step": 32400 + }, + { + "epoch": 0.7200942054745822, + "grad_norm": 2.546875, + "learning_rate": 5.596e-06, + "loss": 0.3537, + "step": 32410 + }, + { + "epoch": 0.7203163881976538, + "grad_norm": 2.796875, + "learning_rate": 5.591555555555556e-06, + "loss": 0.3979, + "step": 32420 + }, + { + "epoch": 0.7205385709207252, + "grad_norm": 2.40625, + "learning_rate": 5.587111111111112e-06, + "loss": 0.3518, + "step": 32430 + }, + { + "epoch": 0.7207607536437967, + "grad_norm": 2.484375, + "learning_rate": 5.582666666666667e-06, + "loss": 0.3972, + "step": 32440 + }, + { + "epoch": 0.7209829363668682, + "grad_norm": 2.4375, + "learning_rate": 5.578222222222223e-06, + "loss": 0.399, + "step": 32450 + }, + { + "epoch": 0.7212051190899396, + "grad_norm": 2.203125, + "learning_rate": 5.573777777777778e-06, + "loss": 0.4075, + "step": 32460 + }, + { + "epoch": 0.7214273018130111, + "grad_norm": 2.65625, + "learning_rate": 5.569333333333334e-06, + "loss": 0.383, + "step": 32470 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 2.53125, + "learning_rate": 5.564888888888889e-06, + "loss": 0.4107, + "step": 32480 + }, + { + "epoch": 0.721871667259154, + "grad_norm": 2.734375, + "learning_rate": 5.560444444444445e-06, + "loss": 0.3835, + "step": 32490 + }, + { + "epoch": 0.7220938499822254, + "grad_norm": 2.0625, + "learning_rate": 5.556000000000001e-06, + "loss": 0.4052, + "step": 32500 + }, + { + "epoch": 0.7223160327052969, + "grad_norm": 2.421875, + "learning_rate": 5.5515555555555555e-06, + "loss": 0.3796, + "step": 32510 + }, + { + "epoch": 0.7225382154283683, + "grad_norm": 3.296875, + "learning_rate": 5.547111111111112e-06, + "loss": 0.386, + "step": 32520 + }, + { + "epoch": 0.7227603981514398, + "grad_norm": 2.5, + "learning_rate": 5.542666666666667e-06, + "loss": 0.3719, + "step": 32530 + }, + { + "epoch": 0.7229825808745112, + "grad_norm": 2.703125, + "learning_rate": 5.538222222222223e-06, + "loss": 0.3423, + "step": 32540 + }, + { + "epoch": 0.7232047635975827, + "grad_norm": 2.71875, + "learning_rate": 5.533777777777778e-06, + "loss": 0.378, + "step": 32550 + }, + { + "epoch": 0.7234269463206541, + "grad_norm": 2.265625, + "learning_rate": 5.529333333333334e-06, + "loss": 0.3324, + "step": 32560 + }, + { + "epoch": 0.7236491290437256, + "grad_norm": 2.265625, + "learning_rate": 5.52488888888889e-06, + "loss": 0.3906, + "step": 32570 + }, + { + "epoch": 0.723871311766797, + "grad_norm": 2.515625, + "learning_rate": 5.5204444444444445e-06, + "loss": 0.3532, + "step": 32580 + }, + { + "epoch": 0.7240934944898685, + "grad_norm": 2.390625, + "learning_rate": 5.516e-06, + "loss": 0.4058, + "step": 32590 + }, + { + "epoch": 0.7243156772129399, + "grad_norm": 2.8125, + "learning_rate": 5.511555555555556e-06, + "loss": 0.3908, + "step": 32600 + }, + { + "epoch": 0.7245378599360114, + "grad_norm": 2.8125, + "learning_rate": 5.507111111111112e-06, + "loss": 0.3788, + "step": 32610 + }, + { + "epoch": 0.7247600426590828, + "grad_norm": 2.75, + "learning_rate": 5.502666666666667e-06, + "loss": 0.4041, + "step": 32620 + }, + { + "epoch": 0.7249822253821543, + "grad_norm": 2.34375, + "learning_rate": 5.498222222222223e-06, + "loss": 0.3679, + "step": 32630 + }, + { + "epoch": 0.7252044081052257, + "grad_norm": 2.796875, + "learning_rate": 5.493777777777778e-06, + "loss": 0.4047, + "step": 32640 + }, + { + "epoch": 0.7254265908282972, + "grad_norm": 2.296875, + "learning_rate": 5.489333333333334e-06, + "loss": 0.3897, + "step": 32650 + }, + { + "epoch": 0.7256487735513687, + "grad_norm": 2.4375, + "learning_rate": 5.484888888888889e-06, + "loss": 0.3811, + "step": 32660 + }, + { + "epoch": 0.7258709562744401, + "grad_norm": 2.15625, + "learning_rate": 5.480444444444445e-06, + "loss": 0.3456, + "step": 32670 + }, + { + "epoch": 0.7260931389975116, + "grad_norm": 2.40625, + "learning_rate": 5.476000000000001e-06, + "loss": 0.3679, + "step": 32680 + }, + { + "epoch": 0.726315321720583, + "grad_norm": 2.71875, + "learning_rate": 5.4715555555555554e-06, + "loss": 0.3522, + "step": 32690 + }, + { + "epoch": 0.7265375044436545, + "grad_norm": 2.765625, + "learning_rate": 5.467111111111112e-06, + "loss": 0.4007, + "step": 32700 + }, + { + "epoch": 0.7267596871667259, + "grad_norm": 2.078125, + "learning_rate": 5.462666666666667e-06, + "loss": 0.3458, + "step": 32710 + }, + { + "epoch": 0.7269818698897974, + "grad_norm": 2.65625, + "learning_rate": 5.4582222222222225e-06, + "loss": 0.3631, + "step": 32720 + }, + { + "epoch": 0.7272040526128688, + "grad_norm": 2.296875, + "learning_rate": 5.453777777777778e-06, + "loss": 0.379, + "step": 32730 + }, + { + "epoch": 0.7274262353359403, + "grad_norm": 2.328125, + "learning_rate": 5.449333333333334e-06, + "loss": 0.3544, + "step": 32740 + }, + { + "epoch": 0.7276484180590117, + "grad_norm": 2.953125, + "learning_rate": 5.44488888888889e-06, + "loss": 0.3485, + "step": 32750 + }, + { + "epoch": 0.7278706007820832, + "grad_norm": 2.578125, + "learning_rate": 5.4404444444444444e-06, + "loss": 0.4027, + "step": 32760 + }, + { + "epoch": 0.7280927835051546, + "grad_norm": 2.6875, + "learning_rate": 5.436e-06, + "loss": 0.4076, + "step": 32770 + }, + { + "epoch": 0.7283149662282261, + "grad_norm": 2.203125, + "learning_rate": 5.431555555555556e-06, + "loss": 0.3965, + "step": 32780 + }, + { + "epoch": 0.7285371489512975, + "grad_norm": 2.25, + "learning_rate": 5.4271111111111115e-06, + "loss": 0.3601, + "step": 32790 + }, + { + "epoch": 0.728759331674369, + "grad_norm": 2.1875, + "learning_rate": 5.422666666666667e-06, + "loss": 0.3642, + "step": 32800 + }, + { + "epoch": 0.7289815143974404, + "grad_norm": 2.609375, + "learning_rate": 5.418222222222223e-06, + "loss": 0.3831, + "step": 32810 + }, + { + "epoch": 0.7292036971205119, + "grad_norm": 2.59375, + "learning_rate": 5.413777777777778e-06, + "loss": 0.3745, + "step": 32820 + }, + { + "epoch": 0.7294258798435833, + "grad_norm": 2.40625, + "learning_rate": 5.409333333333334e-06, + "loss": 0.3593, + "step": 32830 + }, + { + "epoch": 0.7296480625666548, + "grad_norm": 2.4375, + "learning_rate": 5.404888888888889e-06, + "loss": 0.3802, + "step": 32840 + }, + { + "epoch": 0.7298702452897262, + "grad_norm": 2.703125, + "learning_rate": 5.400444444444445e-06, + "loss": 0.3895, + "step": 32850 + }, + { + "epoch": 0.7300924280127977, + "grad_norm": 2.6875, + "learning_rate": 5.3960000000000005e-06, + "loss": 0.3854, + "step": 32860 + }, + { + "epoch": 0.7303146107358692, + "grad_norm": 2.796875, + "learning_rate": 5.391555555555556e-06, + "loss": 0.389, + "step": 32870 + }, + { + "epoch": 0.7305367934589406, + "grad_norm": 3.140625, + "learning_rate": 5.387111111111112e-06, + "loss": 0.3785, + "step": 32880 + }, + { + "epoch": 0.7307589761820121, + "grad_norm": 2.359375, + "learning_rate": 5.382666666666667e-06, + "loss": 0.4139, + "step": 32890 + }, + { + "epoch": 0.7309811589050835, + "grad_norm": 2.5625, + "learning_rate": 5.3782222222222224e-06, + "loss": 0.3552, + "step": 32900 + }, + { + "epoch": 0.731203341628155, + "grad_norm": 2.46875, + "learning_rate": 5.373777777777778e-06, + "loss": 0.3713, + "step": 32910 + }, + { + "epoch": 0.7314255243512264, + "grad_norm": 2.421875, + "learning_rate": 5.369333333333334e-06, + "loss": 0.3602, + "step": 32920 + }, + { + "epoch": 0.7316477070742979, + "grad_norm": 2.34375, + "learning_rate": 5.3648888888888895e-06, + "loss": 0.3437, + "step": 32930 + }, + { + "epoch": 0.7318698897973693, + "grad_norm": 2.25, + "learning_rate": 5.360444444444445e-06, + "loss": 0.3611, + "step": 32940 + }, + { + "epoch": 0.7320920725204408, + "grad_norm": 2.8125, + "learning_rate": 5.356e-06, + "loss": 0.3704, + "step": 32950 + }, + { + "epoch": 0.7323142552435122, + "grad_norm": 2.515625, + "learning_rate": 5.351555555555557e-06, + "loss": 0.4018, + "step": 32960 + }, + { + "epoch": 0.7325364379665837, + "grad_norm": 2.53125, + "learning_rate": 5.3471111111111114e-06, + "loss": 0.4128, + "step": 32970 + }, + { + "epoch": 0.7327586206896551, + "grad_norm": 2.046875, + "learning_rate": 5.342666666666667e-06, + "loss": 0.3732, + "step": 32980 + }, + { + "epoch": 0.7329808034127266, + "grad_norm": 2.421875, + "learning_rate": 5.338222222222223e-06, + "loss": 0.3742, + "step": 32990 + }, + { + "epoch": 0.733202986135798, + "grad_norm": 2.40625, + "learning_rate": 5.333777777777778e-06, + "loss": 0.3759, + "step": 33000 + }, + { + "epoch": 0.7334251688588695, + "grad_norm": 3.125, + "learning_rate": 5.329333333333334e-06, + "loss": 0.3769, + "step": 33010 + }, + { + "epoch": 0.7336473515819409, + "grad_norm": 2.828125, + "learning_rate": 5.324888888888889e-06, + "loss": 0.3619, + "step": 33020 + }, + { + "epoch": 0.7338695343050124, + "grad_norm": 2.484375, + "learning_rate": 5.320444444444445e-06, + "loss": 0.4092, + "step": 33030 + }, + { + "epoch": 0.7340917170280838, + "grad_norm": 2.625, + "learning_rate": 5.3160000000000004e-06, + "loss": 0.4058, + "step": 33040 + }, + { + "epoch": 0.7343138997511554, + "grad_norm": 2.65625, + "learning_rate": 5.311555555555556e-06, + "loss": 0.4174, + "step": 33050 + }, + { + "epoch": 0.7345360824742269, + "grad_norm": 2.3125, + "learning_rate": 5.307111111111112e-06, + "loss": 0.4063, + "step": 33060 + }, + { + "epoch": 0.7347582651972983, + "grad_norm": 2.75, + "learning_rate": 5.302666666666667e-06, + "loss": 0.3657, + "step": 33070 + }, + { + "epoch": 0.7349804479203698, + "grad_norm": 2.328125, + "learning_rate": 5.298222222222222e-06, + "loss": 0.3567, + "step": 33080 + }, + { + "epoch": 0.7352026306434412, + "grad_norm": 2.578125, + "learning_rate": 5.293777777777778e-06, + "loss": 0.395, + "step": 33090 + }, + { + "epoch": 0.7354248133665127, + "grad_norm": 2.421875, + "learning_rate": 5.289333333333334e-06, + "loss": 0.3675, + "step": 33100 + }, + { + "epoch": 0.7356469960895841, + "grad_norm": 2.515625, + "learning_rate": 5.2848888888888894e-06, + "loss": 0.4294, + "step": 33110 + }, + { + "epoch": 0.7358691788126556, + "grad_norm": 2.65625, + "learning_rate": 5.280444444444445e-06, + "loss": 0.3893, + "step": 33120 + }, + { + "epoch": 0.736091361535727, + "grad_norm": 2.8125, + "learning_rate": 5.276e-06, + "loss": 0.3707, + "step": 33130 + }, + { + "epoch": 0.7363135442587985, + "grad_norm": 2.375, + "learning_rate": 5.2715555555555565e-06, + "loss": 0.4016, + "step": 33140 + }, + { + "epoch": 0.7365357269818699, + "grad_norm": 2.484375, + "learning_rate": 5.267111111111111e-06, + "loss": 0.3467, + "step": 33150 + }, + { + "epoch": 0.7367579097049414, + "grad_norm": 2.078125, + "learning_rate": 5.262666666666667e-06, + "loss": 0.3979, + "step": 33160 + }, + { + "epoch": 0.7369800924280128, + "grad_norm": 2.421875, + "learning_rate": 5.258222222222223e-06, + "loss": 0.4023, + "step": 33170 + }, + { + "epoch": 0.7372022751510843, + "grad_norm": 2.171875, + "learning_rate": 5.2537777777777784e-06, + "loss": 0.3649, + "step": 33180 + }, + { + "epoch": 0.7374244578741557, + "grad_norm": 2.5625, + "learning_rate": 5.249333333333334e-06, + "loss": 0.3974, + "step": 33190 + }, + { + "epoch": 0.7376466405972272, + "grad_norm": 2.59375, + "learning_rate": 5.244888888888889e-06, + "loss": 0.3581, + "step": 33200 + }, + { + "epoch": 0.7378688233202986, + "grad_norm": 2.90625, + "learning_rate": 5.240444444444445e-06, + "loss": 0.3588, + "step": 33210 + }, + { + "epoch": 0.7380910060433701, + "grad_norm": 2.796875, + "learning_rate": 5.236e-06, + "loss": 0.367, + "step": 33220 + }, + { + "epoch": 0.7383131887664415, + "grad_norm": 2.703125, + "learning_rate": 5.231555555555556e-06, + "loss": 0.4256, + "step": 33230 + }, + { + "epoch": 0.738535371489513, + "grad_norm": 2.5625, + "learning_rate": 5.227111111111112e-06, + "loss": 0.3773, + "step": 33240 + }, + { + "epoch": 0.7387575542125844, + "grad_norm": 2.46875, + "learning_rate": 5.222666666666667e-06, + "loss": 0.3831, + "step": 33250 + }, + { + "epoch": 0.7389797369356559, + "grad_norm": 2.703125, + "learning_rate": 5.218222222222222e-06, + "loss": 0.3948, + "step": 33260 + }, + { + "epoch": 0.7392019196587274, + "grad_norm": 2.65625, + "learning_rate": 5.213777777777779e-06, + "loss": 0.3287, + "step": 33270 + }, + { + "epoch": 0.7394241023817988, + "grad_norm": 2.59375, + "learning_rate": 5.209333333333334e-06, + "loss": 0.3519, + "step": 33280 + }, + { + "epoch": 0.7396462851048703, + "grad_norm": 2.53125, + "learning_rate": 5.204888888888889e-06, + "loss": 0.3808, + "step": 33290 + }, + { + "epoch": 0.7398684678279417, + "grad_norm": 2.40625, + "learning_rate": 5.200444444444445e-06, + "loss": 0.3694, + "step": 33300 + }, + { + "epoch": 0.7400906505510132, + "grad_norm": 2.3125, + "learning_rate": 5.196e-06, + "loss": 0.3577, + "step": 33310 + }, + { + "epoch": 0.7403128332740846, + "grad_norm": 2.875, + "learning_rate": 5.1915555555555564e-06, + "loss": 0.4061, + "step": 33320 + }, + { + "epoch": 0.7405350159971561, + "grad_norm": 2.703125, + "learning_rate": 5.187111111111111e-06, + "loss": 0.3947, + "step": 33330 + }, + { + "epoch": 0.7407571987202275, + "grad_norm": 2.359375, + "learning_rate": 5.182666666666667e-06, + "loss": 0.3818, + "step": 33340 + }, + { + "epoch": 0.740979381443299, + "grad_norm": 2.296875, + "learning_rate": 5.178222222222223e-06, + "loss": 0.3778, + "step": 33350 + }, + { + "epoch": 0.7412015641663704, + "grad_norm": 2.0, + "learning_rate": 5.173777777777778e-06, + "loss": 0.3661, + "step": 33360 + }, + { + "epoch": 0.7414237468894419, + "grad_norm": 2.59375, + "learning_rate": 5.169333333333334e-06, + "loss": 0.364, + "step": 33370 + }, + { + "epoch": 0.7416459296125133, + "grad_norm": 2.578125, + "learning_rate": 5.164888888888889e-06, + "loss": 0.3494, + "step": 33380 + }, + { + "epoch": 0.7418681123355848, + "grad_norm": 2.515625, + "learning_rate": 5.160444444444445e-06, + "loss": 0.3959, + "step": 33390 + }, + { + "epoch": 0.7420902950586562, + "grad_norm": 2.703125, + "learning_rate": 5.156e-06, + "loss": 0.3567, + "step": 33400 + }, + { + "epoch": 0.7423124777817277, + "grad_norm": 2.421875, + "learning_rate": 5.151555555555556e-06, + "loss": 0.3257, + "step": 33410 + }, + { + "epoch": 0.7425346605047991, + "grad_norm": 2.265625, + "learning_rate": 5.147111111111112e-06, + "loss": 0.3331, + "step": 33420 + }, + { + "epoch": 0.7427568432278706, + "grad_norm": 3.046875, + "learning_rate": 5.1426666666666665e-06, + "loss": 0.3841, + "step": 33430 + }, + { + "epoch": 0.742979025950942, + "grad_norm": 2.484375, + "learning_rate": 5.138222222222222e-06, + "loss": 0.3922, + "step": 33440 + }, + { + "epoch": 0.7432012086740135, + "grad_norm": 2.1875, + "learning_rate": 5.133777777777779e-06, + "loss": 0.3382, + "step": 33450 + }, + { + "epoch": 0.7434233913970849, + "grad_norm": 2.484375, + "learning_rate": 5.129333333333334e-06, + "loss": 0.3916, + "step": 33460 + }, + { + "epoch": 0.7436455741201564, + "grad_norm": 2.546875, + "learning_rate": 5.124888888888889e-06, + "loss": 0.3768, + "step": 33470 + }, + { + "epoch": 0.7438677568432279, + "grad_norm": 2.5, + "learning_rate": 5.120444444444445e-06, + "loss": 0.35, + "step": 33480 + }, + { + "epoch": 0.7440899395662993, + "grad_norm": 2.578125, + "learning_rate": 5.116000000000001e-06, + "loss": 0.3907, + "step": 33490 + }, + { + "epoch": 0.7443121222893708, + "grad_norm": 2.265625, + "learning_rate": 5.111555555555556e-06, + "loss": 0.4041, + "step": 33500 + }, + { + "epoch": 0.7445343050124422, + "grad_norm": 2.5625, + "learning_rate": 5.107111111111111e-06, + "loss": 0.3545, + "step": 33510 + }, + { + "epoch": 0.7447564877355137, + "grad_norm": 2.359375, + "learning_rate": 5.102666666666667e-06, + "loss": 0.4054, + "step": 33520 + }, + { + "epoch": 0.7449786704585851, + "grad_norm": 2.75, + "learning_rate": 5.0982222222222226e-06, + "loss": 0.3445, + "step": 33530 + }, + { + "epoch": 0.7452008531816566, + "grad_norm": 2.4375, + "learning_rate": 5.093777777777778e-06, + "loss": 0.377, + "step": 33540 + }, + { + "epoch": 0.745423035904728, + "grad_norm": 2.265625, + "learning_rate": 5.089333333333334e-06, + "loss": 0.3659, + "step": 33550 + }, + { + "epoch": 0.7456452186277995, + "grad_norm": 2.4375, + "learning_rate": 5.084888888888889e-06, + "loss": 0.3905, + "step": 33560 + }, + { + "epoch": 0.7458674013508709, + "grad_norm": 3.0625, + "learning_rate": 5.0804444444444445e-06, + "loss": 0.4162, + "step": 33570 + }, + { + "epoch": 0.7460895840739424, + "grad_norm": 2.5625, + "learning_rate": 5.076000000000001e-06, + "loss": 0.4015, + "step": 33580 + }, + { + "epoch": 0.7463117667970138, + "grad_norm": 2.296875, + "learning_rate": 5.071555555555556e-06, + "loss": 0.3701, + "step": 33590 + }, + { + "epoch": 0.7465339495200853, + "grad_norm": 2.90625, + "learning_rate": 5.0671111111111116e-06, + "loss": 0.3764, + "step": 33600 + }, + { + "epoch": 0.7467561322431567, + "grad_norm": 1.828125, + "learning_rate": 5.062666666666666e-06, + "loss": 0.3787, + "step": 33610 + }, + { + "epoch": 0.7469783149662282, + "grad_norm": 2.328125, + "learning_rate": 5.058222222222222e-06, + "loss": 0.3511, + "step": 33620 + }, + { + "epoch": 0.7472004976892996, + "grad_norm": 2.234375, + "learning_rate": 5.053777777777779e-06, + "loss": 0.386, + "step": 33630 + }, + { + "epoch": 0.7474226804123711, + "grad_norm": 2.546875, + "learning_rate": 5.0493333333333335e-06, + "loss": 0.4056, + "step": 33640 + }, + { + "epoch": 0.7476448631354425, + "grad_norm": 2.625, + "learning_rate": 5.044888888888889e-06, + "loss": 0.4146, + "step": 33650 + }, + { + "epoch": 0.747867045858514, + "grad_norm": 2.453125, + "learning_rate": 5.040444444444445e-06, + "loss": 0.3825, + "step": 33660 + }, + { + "epoch": 0.7480892285815854, + "grad_norm": 2.640625, + "learning_rate": 5.0360000000000006e-06, + "loss": 0.3307, + "step": 33670 + }, + { + "epoch": 0.748311411304657, + "grad_norm": 2.5, + "learning_rate": 5.031555555555556e-06, + "loss": 0.3789, + "step": 33680 + }, + { + "epoch": 0.7485335940277285, + "grad_norm": 2.625, + "learning_rate": 5.027111111111111e-06, + "loss": 0.3696, + "step": 33690 + }, + { + "epoch": 0.7487557767507999, + "grad_norm": 2.5, + "learning_rate": 5.022666666666667e-06, + "loss": 0.4013, + "step": 33700 + }, + { + "epoch": 0.7489779594738714, + "grad_norm": 2.5625, + "learning_rate": 5.0182222222222225e-06, + "loss": 0.3663, + "step": 33710 + }, + { + "epoch": 0.7492001421969428, + "grad_norm": 2.53125, + "learning_rate": 5.013777777777778e-06, + "loss": 0.3976, + "step": 33720 + }, + { + "epoch": 0.7494223249200143, + "grad_norm": 2.71875, + "learning_rate": 5.009333333333334e-06, + "loss": 0.3801, + "step": 33730 + }, + { + "epoch": 0.7496445076430857, + "grad_norm": 2.140625, + "learning_rate": 5.004888888888889e-06, + "loss": 0.3946, + "step": 33740 + }, + { + "epoch": 0.7498666903661572, + "grad_norm": 2.1875, + "learning_rate": 5.000444444444444e-06, + "loss": 0.3812, + "step": 33750 + }, + { + "epoch": 0.7500888730892286, + "grad_norm": 2.765625, + "learning_rate": 4.996e-06, + "loss": 0.4265, + "step": 33760 + }, + { + "epoch": 0.7503110558123001, + "grad_norm": 2.359375, + "learning_rate": 4.991555555555556e-06, + "loss": 0.3479, + "step": 33770 + }, + { + "epoch": 0.7505332385353715, + "grad_norm": 2.484375, + "learning_rate": 4.9871111111111115e-06, + "loss": 0.3982, + "step": 33780 + }, + { + "epoch": 0.750755421258443, + "grad_norm": 2.796875, + "learning_rate": 4.982666666666667e-06, + "loss": 0.4098, + "step": 33790 + }, + { + "epoch": 0.7509776039815144, + "grad_norm": 2.78125, + "learning_rate": 4.978222222222223e-06, + "loss": 0.4078, + "step": 33800 + }, + { + "epoch": 0.7511997867045859, + "grad_norm": 2.21875, + "learning_rate": 4.973777777777778e-06, + "loss": 0.3804, + "step": 33810 + }, + { + "epoch": 0.7514219694276573, + "grad_norm": 2.453125, + "learning_rate": 4.969333333333333e-06, + "loss": 0.3774, + "step": 33820 + }, + { + "epoch": 0.7516441521507288, + "grad_norm": 2.421875, + "learning_rate": 4.964888888888889e-06, + "loss": 0.3881, + "step": 33830 + }, + { + "epoch": 0.7518663348738002, + "grad_norm": 2.71875, + "learning_rate": 4.960444444444445e-06, + "loss": 0.3709, + "step": 33840 + }, + { + "epoch": 0.7520885175968717, + "grad_norm": 2.5, + "learning_rate": 4.9560000000000005e-06, + "loss": 0.3514, + "step": 33850 + }, + { + "epoch": 0.7523107003199431, + "grad_norm": 2.296875, + "learning_rate": 4.951555555555556e-06, + "loss": 0.3587, + "step": 33860 + }, + { + "epoch": 0.7525328830430146, + "grad_norm": 2.390625, + "learning_rate": 4.947111111111111e-06, + "loss": 0.3717, + "step": 33870 + }, + { + "epoch": 0.7527550657660861, + "grad_norm": 2.828125, + "learning_rate": 4.9426666666666676e-06, + "loss": 0.3629, + "step": 33880 + }, + { + "epoch": 0.7529772484891575, + "grad_norm": 2.171875, + "learning_rate": 4.938222222222222e-06, + "loss": 0.3995, + "step": 33890 + }, + { + "epoch": 0.753199431212229, + "grad_norm": 2.890625, + "learning_rate": 4.933777777777778e-06, + "loss": 0.3856, + "step": 33900 + }, + { + "epoch": 0.7534216139353004, + "grad_norm": 2.859375, + "learning_rate": 4.929333333333334e-06, + "loss": 0.382, + "step": 33910 + }, + { + "epoch": 0.7536437966583719, + "grad_norm": 2.578125, + "learning_rate": 4.924888888888889e-06, + "loss": 0.3794, + "step": 33920 + }, + { + "epoch": 0.7538659793814433, + "grad_norm": 2.4375, + "learning_rate": 4.920444444444445e-06, + "loss": 0.3611, + "step": 33930 + }, + { + "epoch": 0.7540881621045148, + "grad_norm": 2.640625, + "learning_rate": 4.916e-06, + "loss": 0.3722, + "step": 33940 + }, + { + "epoch": 0.7543103448275862, + "grad_norm": 2.265625, + "learning_rate": 4.911555555555556e-06, + "loss": 0.366, + "step": 33950 + }, + { + "epoch": 0.7545325275506577, + "grad_norm": 2.390625, + "learning_rate": 4.907111111111111e-06, + "loss": 0.3687, + "step": 33960 + }, + { + "epoch": 0.7547547102737291, + "grad_norm": 2.421875, + "learning_rate": 4.902666666666667e-06, + "loss": 0.3423, + "step": 33970 + }, + { + "epoch": 0.7549768929968006, + "grad_norm": 2.46875, + "learning_rate": 4.898222222222223e-06, + "loss": 0.4181, + "step": 33980 + }, + { + "epoch": 0.755199075719872, + "grad_norm": 3.125, + "learning_rate": 4.8937777777777785e-06, + "loss": 0.3907, + "step": 33990 + }, + { + "epoch": 0.7554212584429435, + "grad_norm": 2.828125, + "learning_rate": 4.889333333333333e-06, + "loss": 0.3739, + "step": 34000 + }, + { + "epoch": 0.7556434411660149, + "grad_norm": 2.578125, + "learning_rate": 4.884888888888889e-06, + "loss": 0.3817, + "step": 34010 + }, + { + "epoch": 0.7558656238890864, + "grad_norm": 2.328125, + "learning_rate": 4.880444444444445e-06, + "loss": 0.3424, + "step": 34020 + }, + { + "epoch": 0.7560878066121578, + "grad_norm": 2.328125, + "learning_rate": 4.876e-06, + "loss": 0.3504, + "step": 34030 + }, + { + "epoch": 0.7563099893352293, + "grad_norm": 2.796875, + "learning_rate": 4.871555555555556e-06, + "loss": 0.4067, + "step": 34040 + }, + { + "epoch": 0.7565321720583007, + "grad_norm": 2.890625, + "learning_rate": 4.867111111111111e-06, + "loss": 0.3779, + "step": 34050 + }, + { + "epoch": 0.7567543547813722, + "grad_norm": 2.296875, + "learning_rate": 4.8626666666666675e-06, + "loss": 0.3708, + "step": 34060 + }, + { + "epoch": 0.7569765375044436, + "grad_norm": 2.0625, + "learning_rate": 4.858222222222222e-06, + "loss": 0.3694, + "step": 34070 + }, + { + "epoch": 0.7571987202275151, + "grad_norm": 2.5625, + "learning_rate": 4.853777777777778e-06, + "loss": 0.3966, + "step": 34080 + }, + { + "epoch": 0.7574209029505866, + "grad_norm": 2.390625, + "learning_rate": 4.849333333333334e-06, + "loss": 0.3978, + "step": 34090 + }, + { + "epoch": 0.757643085673658, + "grad_norm": 2.3125, + "learning_rate": 4.844888888888889e-06, + "loss": 0.3731, + "step": 34100 + }, + { + "epoch": 0.7578652683967295, + "grad_norm": 2.21875, + "learning_rate": 4.840444444444445e-06, + "loss": 0.352, + "step": 34110 + }, + { + "epoch": 0.7580874511198009, + "grad_norm": 2.328125, + "learning_rate": 4.836e-06, + "loss": 0.367, + "step": 34120 + }, + { + "epoch": 0.7583096338428724, + "grad_norm": 2.4375, + "learning_rate": 4.831555555555556e-06, + "loss": 0.3731, + "step": 34130 + }, + { + "epoch": 0.7585318165659438, + "grad_norm": 2.078125, + "learning_rate": 4.827111111111111e-06, + "loss": 0.3773, + "step": 34140 + }, + { + "epoch": 0.7587539992890153, + "grad_norm": 2.171875, + "learning_rate": 4.822666666666667e-06, + "loss": 0.4028, + "step": 34150 + }, + { + "epoch": 0.7589761820120867, + "grad_norm": 2.734375, + "learning_rate": 4.818222222222223e-06, + "loss": 0.3595, + "step": 34160 + }, + { + "epoch": 0.7591983647351582, + "grad_norm": 3.046875, + "learning_rate": 4.813777777777778e-06, + "loss": 0.3652, + "step": 34170 + }, + { + "epoch": 0.7594205474582296, + "grad_norm": 2.3125, + "learning_rate": 4.809333333333333e-06, + "loss": 0.3695, + "step": 34180 + }, + { + "epoch": 0.7596427301813011, + "grad_norm": 2.375, + "learning_rate": 4.80488888888889e-06, + "loss": 0.3856, + "step": 34190 + }, + { + "epoch": 0.7598649129043725, + "grad_norm": 2.40625, + "learning_rate": 4.800444444444445e-06, + "loss": 0.3393, + "step": 34200 + }, + { + "epoch": 0.760087095627444, + "grad_norm": 2.59375, + "learning_rate": 4.796e-06, + "loss": 0.3476, + "step": 34210 + }, + { + "epoch": 0.7603092783505154, + "grad_norm": 2.90625, + "learning_rate": 4.791555555555556e-06, + "loss": 0.3445, + "step": 34220 + }, + { + "epoch": 0.7605314610735869, + "grad_norm": 2.59375, + "learning_rate": 4.787111111111111e-06, + "loss": 0.3893, + "step": 34230 + }, + { + "epoch": 0.7607536437966583, + "grad_norm": 2.5, + "learning_rate": 4.782666666666667e-06, + "loss": 0.4079, + "step": 34240 + }, + { + "epoch": 0.7609758265197298, + "grad_norm": 2.515625, + "learning_rate": 4.778222222222222e-06, + "loss": 0.3897, + "step": 34250 + }, + { + "epoch": 0.7611980092428012, + "grad_norm": 2.765625, + "learning_rate": 4.773777777777778e-06, + "loss": 0.3997, + "step": 34260 + }, + { + "epoch": 0.7614201919658727, + "grad_norm": 2.34375, + "learning_rate": 4.769333333333334e-06, + "loss": 0.3641, + "step": 34270 + }, + { + "epoch": 0.7616423746889441, + "grad_norm": 2.515625, + "learning_rate": 4.764888888888889e-06, + "loss": 0.3564, + "step": 34280 + }, + { + "epoch": 0.7618645574120156, + "grad_norm": 2.625, + "learning_rate": 4.760444444444445e-06, + "loss": 0.3899, + "step": 34290 + }, + { + "epoch": 0.7620867401350871, + "grad_norm": 2.546875, + "learning_rate": 4.756000000000001e-06, + "loss": 0.3494, + "step": 34300 + }, + { + "epoch": 0.7623089228581585, + "grad_norm": 2.484375, + "learning_rate": 4.7515555555555556e-06, + "loss": 0.4105, + "step": 34310 + }, + { + "epoch": 0.76253110558123, + "grad_norm": 2.4375, + "learning_rate": 4.747111111111111e-06, + "loss": 0.3266, + "step": 34320 + }, + { + "epoch": 0.7627532883043014, + "grad_norm": 2.3125, + "learning_rate": 4.742666666666667e-06, + "loss": 0.3259, + "step": 34330 + }, + { + "epoch": 0.762975471027373, + "grad_norm": 2.921875, + "learning_rate": 4.738222222222223e-06, + "loss": 0.3529, + "step": 34340 + }, + { + "epoch": 0.7631976537504443, + "grad_norm": 2.375, + "learning_rate": 4.733777777777778e-06, + "loss": 0.4033, + "step": 34350 + }, + { + "epoch": 0.7634198364735159, + "grad_norm": 2.53125, + "learning_rate": 4.729333333333333e-06, + "loss": 0.3745, + "step": 34360 + }, + { + "epoch": 0.7636420191965873, + "grad_norm": 2.09375, + "learning_rate": 4.72488888888889e-06, + "loss": 0.3941, + "step": 34370 + }, + { + "epoch": 0.7638642019196588, + "grad_norm": 2.078125, + "learning_rate": 4.7204444444444446e-06, + "loss": 0.3911, + "step": 34380 + }, + { + "epoch": 0.7640863846427302, + "grad_norm": 2.65625, + "learning_rate": 4.716e-06, + "loss": 0.3599, + "step": 34390 + }, + { + "epoch": 0.7643085673658017, + "grad_norm": 2.984375, + "learning_rate": 4.711555555555556e-06, + "loss": 0.4085, + "step": 34400 + }, + { + "epoch": 0.7645307500888731, + "grad_norm": 2.1875, + "learning_rate": 4.707111111111112e-06, + "loss": 0.3546, + "step": 34410 + }, + { + "epoch": 0.7647529328119446, + "grad_norm": 3.125, + "learning_rate": 4.702666666666667e-06, + "loss": 0.4001, + "step": 34420 + }, + { + "epoch": 0.764975115535016, + "grad_norm": 2.65625, + "learning_rate": 4.698222222222222e-06, + "loss": 0.4021, + "step": 34430 + }, + { + "epoch": 0.7651972982580875, + "grad_norm": 3.296875, + "learning_rate": 4.693777777777778e-06, + "loss": 0.3948, + "step": 34440 + }, + { + "epoch": 0.7654194809811589, + "grad_norm": 2.203125, + "learning_rate": 4.6893333333333336e-06, + "loss": 0.3804, + "step": 34450 + }, + { + "epoch": 0.7656416637042304, + "grad_norm": 2.484375, + "learning_rate": 4.684888888888889e-06, + "loss": 0.3928, + "step": 34460 + }, + { + "epoch": 0.7658638464273018, + "grad_norm": 2.75, + "learning_rate": 4.680444444444445e-06, + "loss": 0.3607, + "step": 34470 + }, + { + "epoch": 0.7660860291503733, + "grad_norm": 2.390625, + "learning_rate": 4.676000000000001e-06, + "loss": 0.4113, + "step": 34480 + }, + { + "epoch": 0.7663082118734447, + "grad_norm": 2.203125, + "learning_rate": 4.6715555555555555e-06, + "loss": 0.3627, + "step": 34490 + }, + { + "epoch": 0.7665303945965162, + "grad_norm": 2.34375, + "learning_rate": 4.667111111111112e-06, + "loss": 0.3499, + "step": 34500 + }, + { + "epoch": 0.7667525773195877, + "grad_norm": 2.46875, + "learning_rate": 4.662666666666667e-06, + "loss": 0.3732, + "step": 34510 + }, + { + "epoch": 0.7669747600426591, + "grad_norm": 1.953125, + "learning_rate": 4.6582222222222226e-06, + "loss": 0.3323, + "step": 34520 + }, + { + "epoch": 0.7671969427657306, + "grad_norm": 2.5625, + "learning_rate": 4.653777777777778e-06, + "loss": 0.3621, + "step": 34530 + }, + { + "epoch": 0.767419125488802, + "grad_norm": 2.625, + "learning_rate": 4.649333333333333e-06, + "loss": 0.4036, + "step": 34540 + }, + { + "epoch": 0.7676413082118735, + "grad_norm": 2.609375, + "learning_rate": 4.64488888888889e-06, + "loss": 0.3702, + "step": 34550 + }, + { + "epoch": 0.7678634909349449, + "grad_norm": 2.34375, + "learning_rate": 4.6404444444444445e-06, + "loss": 0.3808, + "step": 34560 + }, + { + "epoch": 0.7680856736580164, + "grad_norm": 2.6875, + "learning_rate": 4.636e-06, + "loss": 0.3658, + "step": 34570 + }, + { + "epoch": 0.7683078563810878, + "grad_norm": 2.84375, + "learning_rate": 4.631555555555556e-06, + "loss": 0.3817, + "step": 34580 + }, + { + "epoch": 0.7685300391041593, + "grad_norm": 2.65625, + "learning_rate": 4.6271111111111116e-06, + "loss": 0.3879, + "step": 34590 + }, + { + "epoch": 0.7687522218272307, + "grad_norm": 2.359375, + "learning_rate": 4.622666666666667e-06, + "loss": 0.3624, + "step": 34600 + }, + { + "epoch": 0.7689744045503022, + "grad_norm": 2.40625, + "learning_rate": 4.618222222222223e-06, + "loss": 0.3752, + "step": 34610 + }, + { + "epoch": 0.7691965872733736, + "grad_norm": 2.078125, + "learning_rate": 4.613777777777778e-06, + "loss": 0.3627, + "step": 34620 + }, + { + "epoch": 0.7694187699964451, + "grad_norm": 2.890625, + "learning_rate": 4.6093333333333335e-06, + "loss": 0.389, + "step": 34630 + }, + { + "epoch": 0.7696409527195165, + "grad_norm": 2.71875, + "learning_rate": 4.604888888888889e-06, + "loss": 0.3609, + "step": 34640 + }, + { + "epoch": 0.769863135442588, + "grad_norm": 2.484375, + "learning_rate": 4.600444444444445e-06, + "loss": 0.3974, + "step": 34650 + }, + { + "epoch": 0.7700853181656594, + "grad_norm": 2.5, + "learning_rate": 4.5960000000000006e-06, + "loss": 0.4181, + "step": 34660 + }, + { + "epoch": 0.7703075008887309, + "grad_norm": 2.453125, + "learning_rate": 4.591555555555555e-06, + "loss": 0.3805, + "step": 34670 + }, + { + "epoch": 0.7705296836118023, + "grad_norm": 2.59375, + "learning_rate": 4.587111111111112e-06, + "loss": 0.374, + "step": 34680 + }, + { + "epoch": 0.7707518663348738, + "grad_norm": 2.21875, + "learning_rate": 4.582666666666667e-06, + "loss": 0.3311, + "step": 34690 + }, + { + "epoch": 0.7709740490579452, + "grad_norm": 2.4375, + "learning_rate": 4.5782222222222225e-06, + "loss": 0.3876, + "step": 34700 + }, + { + "epoch": 0.7711962317810167, + "grad_norm": 2.640625, + "learning_rate": 4.573777777777778e-06, + "loss": 0.3584, + "step": 34710 + }, + { + "epoch": 0.7714184145040882, + "grad_norm": 2.546875, + "learning_rate": 4.569333333333334e-06, + "loss": 0.4144, + "step": 34720 + }, + { + "epoch": 0.7716405972271596, + "grad_norm": 2.71875, + "learning_rate": 4.5648888888888895e-06, + "loss": 0.3428, + "step": 34730 + }, + { + "epoch": 0.7718627799502311, + "grad_norm": 2.859375, + "learning_rate": 4.560444444444444e-06, + "loss": 0.3809, + "step": 34740 + }, + { + "epoch": 0.7720849626733025, + "grad_norm": 2.46875, + "learning_rate": 4.556e-06, + "loss": 0.4017, + "step": 34750 + }, + { + "epoch": 0.772307145396374, + "grad_norm": 2.28125, + "learning_rate": 4.551555555555556e-06, + "loss": 0.3841, + "step": 34760 + }, + { + "epoch": 0.7725293281194454, + "grad_norm": 3.078125, + "learning_rate": 4.5471111111111115e-06, + "loss": 0.3991, + "step": 34770 + }, + { + "epoch": 0.7727515108425169, + "grad_norm": 2.625, + "learning_rate": 4.542666666666667e-06, + "loss": 0.4064, + "step": 34780 + }, + { + "epoch": 0.7729736935655883, + "grad_norm": 2.90625, + "learning_rate": 4.538222222222223e-06, + "loss": 0.3668, + "step": 34790 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 2.515625, + "learning_rate": 4.533777777777778e-06, + "loss": 0.4388, + "step": 34800 + }, + { + "epoch": 0.7734180590117312, + "grad_norm": 2.59375, + "learning_rate": 4.529333333333334e-06, + "loss": 0.3686, + "step": 34810 + }, + { + "epoch": 0.7736402417348027, + "grad_norm": 2.390625, + "learning_rate": 4.524888888888889e-06, + "loss": 0.3894, + "step": 34820 + }, + { + "epoch": 0.7738624244578741, + "grad_norm": 2.859375, + "learning_rate": 4.520444444444445e-06, + "loss": 0.3523, + "step": 34830 + }, + { + "epoch": 0.7740846071809456, + "grad_norm": 2.546875, + "learning_rate": 4.5160000000000005e-06, + "loss": 0.3411, + "step": 34840 + }, + { + "epoch": 0.774306789904017, + "grad_norm": 3.046875, + "learning_rate": 4.511555555555555e-06, + "loss": 0.376, + "step": 34850 + }, + { + "epoch": 0.7745289726270885, + "grad_norm": 2.875, + "learning_rate": 4.507111111111112e-06, + "loss": 0.3893, + "step": 34860 + }, + { + "epoch": 0.7747511553501599, + "grad_norm": 2.3125, + "learning_rate": 4.502666666666667e-06, + "loss": 0.3377, + "step": 34870 + }, + { + "epoch": 0.7749733380732314, + "grad_norm": 2.5625, + "learning_rate": 4.498222222222222e-06, + "loss": 0.396, + "step": 34880 + }, + { + "epoch": 0.7751955207963028, + "grad_norm": 2.3125, + "learning_rate": 4.493777777777778e-06, + "loss": 0.3816, + "step": 34890 + }, + { + "epoch": 0.7754177035193743, + "grad_norm": 2.390625, + "learning_rate": 4.489333333333334e-06, + "loss": 0.3772, + "step": 34900 + }, + { + "epoch": 0.7756398862424458, + "grad_norm": 2.5625, + "learning_rate": 4.4848888888888895e-06, + "loss": 0.3348, + "step": 34910 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 2.390625, + "learning_rate": 4.480444444444445e-06, + "loss": 0.4088, + "step": 34920 + }, + { + "epoch": 0.7760842516885887, + "grad_norm": 2.703125, + "learning_rate": 4.476e-06, + "loss": 0.4077, + "step": 34930 + }, + { + "epoch": 0.7763064344116601, + "grad_norm": 2.296875, + "learning_rate": 4.471555555555556e-06, + "loss": 0.3465, + "step": 34940 + }, + { + "epoch": 0.7765286171347316, + "grad_norm": 2.9375, + "learning_rate": 4.467111111111111e-06, + "loss": 0.3538, + "step": 34950 + }, + { + "epoch": 0.776750799857803, + "grad_norm": 2.734375, + "learning_rate": 4.462666666666667e-06, + "loss": 0.3795, + "step": 34960 + }, + { + "epoch": 0.7769729825808745, + "grad_norm": 2.609375, + "learning_rate": 4.458222222222223e-06, + "loss": 0.3513, + "step": 34970 + }, + { + "epoch": 0.777195165303946, + "grad_norm": 3.5625, + "learning_rate": 4.453777777777778e-06, + "loss": 0.3852, + "step": 34980 + }, + { + "epoch": 0.7774173480270175, + "grad_norm": 3.203125, + "learning_rate": 4.449333333333334e-06, + "loss": 0.3594, + "step": 34990 + }, + { + "epoch": 0.7776395307500888, + "grad_norm": 2.125, + "learning_rate": 4.444888888888889e-06, + "loss": 0.3742, + "step": 35000 + }, + { + "epoch": 0.7778617134731604, + "grad_norm": 2.40625, + "learning_rate": 4.440444444444445e-06, + "loss": 0.4009, + "step": 35010 + }, + { + "epoch": 0.7780838961962318, + "grad_norm": 2.375, + "learning_rate": 4.436e-06, + "loss": 0.3819, + "step": 35020 + }, + { + "epoch": 0.7783060789193033, + "grad_norm": 2.59375, + "learning_rate": 4.431555555555556e-06, + "loss": 0.3715, + "step": 35030 + }, + { + "epoch": 0.7785282616423747, + "grad_norm": 2.84375, + "learning_rate": 4.427111111111112e-06, + "loss": 0.3911, + "step": 35040 + }, + { + "epoch": 0.7787504443654462, + "grad_norm": 2.5, + "learning_rate": 4.422666666666667e-06, + "loss": 0.4007, + "step": 35050 + }, + { + "epoch": 0.7789726270885176, + "grad_norm": 2.359375, + "learning_rate": 4.418222222222222e-06, + "loss": 0.3836, + "step": 35060 + }, + { + "epoch": 0.7791948098115891, + "grad_norm": 2.453125, + "learning_rate": 4.413777777777778e-06, + "loss": 0.3945, + "step": 35070 + }, + { + "epoch": 0.7794169925346605, + "grad_norm": 2.546875, + "learning_rate": 4.409333333333334e-06, + "loss": 0.3924, + "step": 35080 + }, + { + "epoch": 0.779639175257732, + "grad_norm": 2.515625, + "learning_rate": 4.404888888888889e-06, + "loss": 0.3972, + "step": 35090 + }, + { + "epoch": 0.7798613579808034, + "grad_norm": 2.75, + "learning_rate": 4.400444444444445e-06, + "loss": 0.4165, + "step": 35100 + }, + { + "epoch": 0.7800835407038749, + "grad_norm": 2.8125, + "learning_rate": 4.396e-06, + "loss": 0.3743, + "step": 35110 + }, + { + "epoch": 0.7803057234269464, + "grad_norm": 3.046875, + "learning_rate": 4.3915555555555565e-06, + "loss": 0.3532, + "step": 35120 + }, + { + "epoch": 0.7805279061500178, + "grad_norm": 2.328125, + "learning_rate": 4.387111111111111e-06, + "loss": 0.373, + "step": 35130 + }, + { + "epoch": 0.7807500888730893, + "grad_norm": 2.75, + "learning_rate": 4.382666666666667e-06, + "loss": 0.3815, + "step": 35140 + }, + { + "epoch": 0.7809722715961607, + "grad_norm": 2.484375, + "learning_rate": 4.378222222222223e-06, + "loss": 0.3535, + "step": 35150 + }, + { + "epoch": 0.7811944543192322, + "grad_norm": 2.453125, + "learning_rate": 4.3737777777777775e-06, + "loss": 0.371, + "step": 35160 + }, + { + "epoch": 0.7814166370423036, + "grad_norm": 2.25, + "learning_rate": 4.369333333333334e-06, + "loss": 0.3683, + "step": 35170 + }, + { + "epoch": 0.7816388197653751, + "grad_norm": 3.359375, + "learning_rate": 4.364888888888889e-06, + "loss": 0.3765, + "step": 35180 + }, + { + "epoch": 0.7818610024884465, + "grad_norm": 2.390625, + "learning_rate": 4.360444444444445e-06, + "loss": 0.3667, + "step": 35190 + }, + { + "epoch": 0.782083185211518, + "grad_norm": 2.125, + "learning_rate": 4.356e-06, + "loss": 0.3965, + "step": 35200 + }, + { + "epoch": 0.7823053679345894, + "grad_norm": 2.703125, + "learning_rate": 4.351555555555556e-06, + "loss": 0.3767, + "step": 35210 + }, + { + "epoch": 0.7825275506576609, + "grad_norm": 2.890625, + "learning_rate": 4.347111111111112e-06, + "loss": 0.3617, + "step": 35220 + }, + { + "epoch": 0.7827497333807323, + "grad_norm": 2.203125, + "learning_rate": 4.342666666666667e-06, + "loss": 0.3547, + "step": 35230 + }, + { + "epoch": 0.7829719161038038, + "grad_norm": 2.09375, + "learning_rate": 4.338222222222222e-06, + "loss": 0.3716, + "step": 35240 + }, + { + "epoch": 0.7831940988268752, + "grad_norm": 2.375, + "learning_rate": 4.333777777777778e-06, + "loss": 0.378, + "step": 35250 + }, + { + "epoch": 0.7834162815499467, + "grad_norm": 2.6875, + "learning_rate": 4.329333333333334e-06, + "loss": 0.3583, + "step": 35260 + }, + { + "epoch": 0.7836384642730181, + "grad_norm": 2.90625, + "learning_rate": 4.324888888888889e-06, + "loss": 0.3724, + "step": 35270 + }, + { + "epoch": 0.7838606469960896, + "grad_norm": 2.640625, + "learning_rate": 4.320444444444445e-06, + "loss": 0.3697, + "step": 35280 + }, + { + "epoch": 0.784082829719161, + "grad_norm": 2.171875, + "learning_rate": 4.316e-06, + "loss": 0.3935, + "step": 35290 + }, + { + "epoch": 0.7843050124422325, + "grad_norm": 2.140625, + "learning_rate": 4.311555555555556e-06, + "loss": 0.3499, + "step": 35300 + }, + { + "epoch": 0.7845271951653039, + "grad_norm": 2.515625, + "learning_rate": 4.307111111111111e-06, + "loss": 0.4539, + "step": 35310 + }, + { + "epoch": 0.7847493778883754, + "grad_norm": 2.546875, + "learning_rate": 4.302666666666667e-06, + "loss": 0.3557, + "step": 35320 + }, + { + "epoch": 0.7849715606114469, + "grad_norm": 2.71875, + "learning_rate": 4.298222222222223e-06, + "loss": 0.3916, + "step": 35330 + }, + { + "epoch": 0.7851937433345183, + "grad_norm": 2.5, + "learning_rate": 4.293777777777778e-06, + "loss": 0.3796, + "step": 35340 + }, + { + "epoch": 0.7854159260575898, + "grad_norm": 2.6875, + "learning_rate": 4.289333333333334e-06, + "loss": 0.4244, + "step": 35350 + }, + { + "epoch": 0.7856381087806612, + "grad_norm": 2.46875, + "learning_rate": 4.284888888888889e-06, + "loss": 0.3966, + "step": 35360 + }, + { + "epoch": 0.7858602915037327, + "grad_norm": 1.7421875, + "learning_rate": 4.2804444444444445e-06, + "loss": 0.3677, + "step": 35370 + }, + { + "epoch": 0.7860824742268041, + "grad_norm": 2.296875, + "learning_rate": 4.276e-06, + "loss": 0.3921, + "step": 35380 + }, + { + "epoch": 0.7863046569498756, + "grad_norm": 2.859375, + "learning_rate": 4.271555555555556e-06, + "loss": 0.4123, + "step": 35390 + }, + { + "epoch": 0.786526839672947, + "grad_norm": 2.34375, + "learning_rate": 4.267111111111112e-06, + "loss": 0.3939, + "step": 35400 + }, + { + "epoch": 0.7867490223960185, + "grad_norm": 2.453125, + "learning_rate": 4.262666666666667e-06, + "loss": 0.3451, + "step": 35410 + }, + { + "epoch": 0.7869712051190899, + "grad_norm": 2.296875, + "learning_rate": 4.258222222222222e-06, + "loss": 0.4128, + "step": 35420 + }, + { + "epoch": 0.7871933878421614, + "grad_norm": 2.515625, + "learning_rate": 4.253777777777779e-06, + "loss": 0.3621, + "step": 35430 + }, + { + "epoch": 0.7874155705652328, + "grad_norm": 2.5625, + "learning_rate": 4.2493333333333335e-06, + "loss": 0.3936, + "step": 35440 + }, + { + "epoch": 0.7876377532883043, + "grad_norm": 2.296875, + "learning_rate": 4.244888888888889e-06, + "loss": 0.3349, + "step": 35450 + }, + { + "epoch": 0.7878599360113757, + "grad_norm": 2.28125, + "learning_rate": 4.240444444444445e-06, + "loss": 0.3618, + "step": 35460 + }, + { + "epoch": 0.7880821187344472, + "grad_norm": 2.203125, + "learning_rate": 4.236e-06, + "loss": 0.3727, + "step": 35470 + }, + { + "epoch": 0.7883043014575186, + "grad_norm": 2.453125, + "learning_rate": 4.231555555555556e-06, + "loss": 0.36, + "step": 35480 + }, + { + "epoch": 0.7885264841805901, + "grad_norm": 2.40625, + "learning_rate": 4.227111111111111e-06, + "loss": 0.3867, + "step": 35490 + }, + { + "epoch": 0.7887486669036615, + "grad_norm": 2.390625, + "learning_rate": 4.222666666666667e-06, + "loss": 0.3428, + "step": 35500 + }, + { + "epoch": 0.788970849626733, + "grad_norm": 2.203125, + "learning_rate": 4.2182222222222225e-06, + "loss": 0.3838, + "step": 35510 + }, + { + "epoch": 0.7891930323498044, + "grad_norm": 2.546875, + "learning_rate": 4.213777777777778e-06, + "loss": 0.3552, + "step": 35520 + }, + { + "epoch": 0.7894152150728759, + "grad_norm": 2.265625, + "learning_rate": 4.209333333333334e-06, + "loss": 0.3469, + "step": 35530 + }, + { + "epoch": 0.7896373977959474, + "grad_norm": 2.640625, + "learning_rate": 4.20488888888889e-06, + "loss": 0.3799, + "step": 35540 + }, + { + "epoch": 0.7898595805190188, + "grad_norm": 2.078125, + "learning_rate": 4.2004444444444445e-06, + "loss": 0.351, + "step": 35550 + }, + { + "epoch": 0.7900817632420903, + "grad_norm": 2.734375, + "learning_rate": 4.196e-06, + "loss": 0.334, + "step": 35560 + }, + { + "epoch": 0.7903039459651617, + "grad_norm": 2.15625, + "learning_rate": 4.191555555555556e-06, + "loss": 0.3548, + "step": 35570 + }, + { + "epoch": 0.7905261286882332, + "grad_norm": 3.109375, + "learning_rate": 4.1871111111111115e-06, + "loss": 0.3785, + "step": 35580 + }, + { + "epoch": 0.7907483114113046, + "grad_norm": 2.390625, + "learning_rate": 4.182666666666667e-06, + "loss": 0.3791, + "step": 35590 + }, + { + "epoch": 0.7909704941343761, + "grad_norm": 2.40625, + "learning_rate": 4.178222222222222e-06, + "loss": 0.3997, + "step": 35600 + }, + { + "epoch": 0.7911926768574475, + "grad_norm": 3.515625, + "learning_rate": 4.173777777777779e-06, + "loss": 0.3568, + "step": 35610 + }, + { + "epoch": 0.791414859580519, + "grad_norm": 2.65625, + "learning_rate": 4.1693333333333335e-06, + "loss": 0.3632, + "step": 35620 + }, + { + "epoch": 0.7916370423035904, + "grad_norm": 2.859375, + "learning_rate": 4.164888888888889e-06, + "loss": 0.4123, + "step": 35630 + }, + { + "epoch": 0.791859225026662, + "grad_norm": 2.40625, + "learning_rate": 4.160444444444445e-06, + "loss": 0.3667, + "step": 35640 + }, + { + "epoch": 0.7920814077497333, + "grad_norm": 2.5625, + "learning_rate": 4.1560000000000005e-06, + "loss": 0.3872, + "step": 35650 + }, + { + "epoch": 0.7923035904728049, + "grad_norm": 2.5, + "learning_rate": 4.151555555555556e-06, + "loss": 0.3686, + "step": 35660 + }, + { + "epoch": 0.7925257731958762, + "grad_norm": 3.3125, + "learning_rate": 4.147111111111111e-06, + "loss": 0.3643, + "step": 35670 + }, + { + "epoch": 0.7927479559189478, + "grad_norm": 2.71875, + "learning_rate": 4.142666666666667e-06, + "loss": 0.3725, + "step": 35680 + }, + { + "epoch": 0.7929701386420192, + "grad_norm": 2.46875, + "learning_rate": 4.1382222222222224e-06, + "loss": 0.4178, + "step": 35690 + }, + { + "epoch": 0.7931923213650907, + "grad_norm": 2.15625, + "learning_rate": 4.133777777777778e-06, + "loss": 0.3527, + "step": 35700 + }, + { + "epoch": 0.793414504088162, + "grad_norm": 2.4375, + "learning_rate": 4.129333333333334e-06, + "loss": 0.4105, + "step": 35710 + }, + { + "epoch": 0.7936366868112336, + "grad_norm": 2.671875, + "learning_rate": 4.1248888888888895e-06, + "loss": 0.3827, + "step": 35720 + }, + { + "epoch": 0.7938588695343051, + "grad_norm": 2.109375, + "learning_rate": 4.120444444444444e-06, + "loss": 0.3481, + "step": 35730 + }, + { + "epoch": 0.7940810522573765, + "grad_norm": 2.484375, + "learning_rate": 4.116000000000001e-06, + "loss": 0.378, + "step": 35740 + }, + { + "epoch": 0.794303234980448, + "grad_norm": 2.9375, + "learning_rate": 4.111555555555556e-06, + "loss": 0.3797, + "step": 35750 + }, + { + "epoch": 0.7945254177035194, + "grad_norm": 3.125, + "learning_rate": 4.1071111111111114e-06, + "loss": 0.4157, + "step": 35760 + }, + { + "epoch": 0.7947476004265909, + "grad_norm": 2.828125, + "learning_rate": 4.102666666666667e-06, + "loss": 0.3935, + "step": 35770 + }, + { + "epoch": 0.7949697831496623, + "grad_norm": 2.984375, + "learning_rate": 4.098222222222222e-06, + "loss": 0.395, + "step": 35780 + }, + { + "epoch": 0.7951919658727338, + "grad_norm": 2.734375, + "learning_rate": 4.0937777777777785e-06, + "loss": 0.3708, + "step": 35790 + }, + { + "epoch": 0.7954141485958052, + "grad_norm": 2.78125, + "learning_rate": 4.089333333333333e-06, + "loss": 0.3731, + "step": 35800 + }, + { + "epoch": 0.7956363313188767, + "grad_norm": 2.59375, + "learning_rate": 4.084888888888889e-06, + "loss": 0.3528, + "step": 35810 + }, + { + "epoch": 0.7958585140419481, + "grad_norm": 2.625, + "learning_rate": 4.080444444444445e-06, + "loss": 0.3945, + "step": 35820 + }, + { + "epoch": 0.7960806967650196, + "grad_norm": 2.765625, + "learning_rate": 4.0760000000000004e-06, + "loss": 0.3837, + "step": 35830 + }, + { + "epoch": 0.796302879488091, + "grad_norm": 1.953125, + "learning_rate": 4.071555555555556e-06, + "loss": 0.3751, + "step": 35840 + }, + { + "epoch": 0.7965250622111625, + "grad_norm": 2.21875, + "learning_rate": 4.067111111111112e-06, + "loss": 0.3691, + "step": 35850 + }, + { + "epoch": 0.7967472449342339, + "grad_norm": 2.109375, + "learning_rate": 4.062666666666667e-06, + "loss": 0.3627, + "step": 35860 + }, + { + "epoch": 0.7969694276573054, + "grad_norm": 2.59375, + "learning_rate": 4.058222222222222e-06, + "loss": 0.3748, + "step": 35870 + }, + { + "epoch": 0.7971916103803768, + "grad_norm": 2.40625, + "learning_rate": 4.053777777777778e-06, + "loss": 0.3858, + "step": 35880 + }, + { + "epoch": 0.7974137931034483, + "grad_norm": 2.421875, + "learning_rate": 4.049333333333334e-06, + "loss": 0.3567, + "step": 35890 + }, + { + "epoch": 0.7976359758265197, + "grad_norm": 2.28125, + "learning_rate": 4.0448888888888894e-06, + "loss": 0.3555, + "step": 35900 + }, + { + "epoch": 0.7978581585495912, + "grad_norm": 2.4375, + "learning_rate": 4.040444444444444e-06, + "loss": 0.3841, + "step": 35910 + }, + { + "epoch": 0.7980803412726626, + "grad_norm": 2.5625, + "learning_rate": 4.036000000000001e-06, + "loss": 0.3513, + "step": 35920 + }, + { + "epoch": 0.7983025239957341, + "grad_norm": 2.171875, + "learning_rate": 4.031555555555556e-06, + "loss": 0.3697, + "step": 35930 + }, + { + "epoch": 0.7985247067188056, + "grad_norm": 2.578125, + "learning_rate": 4.027111111111111e-06, + "loss": 0.3785, + "step": 35940 + }, + { + "epoch": 0.798746889441877, + "grad_norm": 2.625, + "learning_rate": 4.022666666666667e-06, + "loss": 0.3818, + "step": 35950 + }, + { + "epoch": 0.7989690721649485, + "grad_norm": 2.34375, + "learning_rate": 4.018222222222223e-06, + "loss": 0.3765, + "step": 35960 + }, + { + "epoch": 0.7991912548880199, + "grad_norm": 2.578125, + "learning_rate": 4.0137777777777784e-06, + "loss": 0.3733, + "step": 35970 + }, + { + "epoch": 0.7994134376110914, + "grad_norm": 2.03125, + "learning_rate": 4.009333333333333e-06, + "loss": 0.3615, + "step": 35980 + }, + { + "epoch": 0.7996356203341628, + "grad_norm": 2.953125, + "learning_rate": 4.004888888888889e-06, + "loss": 0.3991, + "step": 35990 + }, + { + "epoch": 0.7998578030572343, + "grad_norm": 2.71875, + "learning_rate": 4.000444444444445e-06, + "loss": 0.4218, + "step": 36000 + }, + { + "epoch": 0.8000799857803057, + "grad_norm": 2.78125, + "learning_rate": 3.996e-06, + "loss": 0.4153, + "step": 36010 + }, + { + "epoch": 0.8003021685033772, + "grad_norm": 2.609375, + "learning_rate": 3.991555555555556e-06, + "loss": 0.416, + "step": 36020 + }, + { + "epoch": 0.8005243512264486, + "grad_norm": 2.71875, + "learning_rate": 3.987111111111112e-06, + "loss": 0.3752, + "step": 36030 + }, + { + "epoch": 0.8007465339495201, + "grad_norm": 2.25, + "learning_rate": 3.982666666666667e-06, + "loss": 0.4093, + "step": 36040 + }, + { + "epoch": 0.8009687166725915, + "grad_norm": 2.546875, + "learning_rate": 3.978222222222223e-06, + "loss": 0.3814, + "step": 36050 + }, + { + "epoch": 0.801190899395663, + "grad_norm": 2.1875, + "learning_rate": 3.973777777777778e-06, + "loss": 0.3947, + "step": 36060 + }, + { + "epoch": 0.8014130821187344, + "grad_norm": 2.34375, + "learning_rate": 3.969333333333334e-06, + "loss": 0.4031, + "step": 36070 + }, + { + "epoch": 0.8016352648418059, + "grad_norm": 2.375, + "learning_rate": 3.964888888888889e-06, + "loss": 0.3883, + "step": 36080 + }, + { + "epoch": 0.8018574475648773, + "grad_norm": 3.046875, + "learning_rate": 3.960444444444444e-06, + "loss": 0.3801, + "step": 36090 + }, + { + "epoch": 0.8020796302879488, + "grad_norm": 2.578125, + "learning_rate": 3.956000000000001e-06, + "loss": 0.4001, + "step": 36100 + }, + { + "epoch": 0.8023018130110202, + "grad_norm": 2.78125, + "learning_rate": 3.951555555555556e-06, + "loss": 0.3976, + "step": 36110 + }, + { + "epoch": 0.8025239957340917, + "grad_norm": 2.53125, + "learning_rate": 3.947111111111111e-06, + "loss": 0.4109, + "step": 36120 + }, + { + "epoch": 0.8027461784571631, + "grad_norm": 2.828125, + "learning_rate": 3.942666666666667e-06, + "loss": 0.3503, + "step": 36130 + }, + { + "epoch": 0.8029683611802346, + "grad_norm": 2.703125, + "learning_rate": 3.938222222222223e-06, + "loss": 0.3853, + "step": 36140 + }, + { + "epoch": 0.8031905439033061, + "grad_norm": 2.890625, + "learning_rate": 3.933777777777778e-06, + "loss": 0.4025, + "step": 36150 + }, + { + "epoch": 0.8034127266263775, + "grad_norm": 2.53125, + "learning_rate": 3.929333333333334e-06, + "loss": 0.3681, + "step": 36160 + }, + { + "epoch": 0.803634909349449, + "grad_norm": 2.140625, + "learning_rate": 3.924888888888889e-06, + "loss": 0.3452, + "step": 36170 + }, + { + "epoch": 0.8038570920725204, + "grad_norm": 2.515625, + "learning_rate": 3.920444444444445e-06, + "loss": 0.3523, + "step": 36180 + }, + { + "epoch": 0.8040792747955919, + "grad_norm": 2.28125, + "learning_rate": 3.916e-06, + "loss": 0.3614, + "step": 36190 + }, + { + "epoch": 0.8043014575186633, + "grad_norm": 2.6875, + "learning_rate": 3.911555555555556e-06, + "loss": 0.4246, + "step": 36200 + }, + { + "epoch": 0.8045236402417348, + "grad_norm": 2.6875, + "learning_rate": 3.907111111111112e-06, + "loss": 0.3876, + "step": 36210 + }, + { + "epoch": 0.8047458229648062, + "grad_norm": 2.5625, + "learning_rate": 3.9026666666666665e-06, + "loss": 0.3782, + "step": 36220 + }, + { + "epoch": 0.8049680056878777, + "grad_norm": 2.53125, + "learning_rate": 3.898222222222223e-06, + "loss": 0.4159, + "step": 36230 + }, + { + "epoch": 0.8051901884109491, + "grad_norm": 2.65625, + "learning_rate": 3.893777777777778e-06, + "loss": 0.3788, + "step": 36240 + }, + { + "epoch": 0.8054123711340206, + "grad_norm": 2.09375, + "learning_rate": 3.889333333333334e-06, + "loss": 0.3817, + "step": 36250 + }, + { + "epoch": 0.805634553857092, + "grad_norm": 2.546875, + "learning_rate": 3.884888888888889e-06, + "loss": 0.4005, + "step": 36260 + }, + { + "epoch": 0.8058567365801635, + "grad_norm": 3.0625, + "learning_rate": 3.880444444444445e-06, + "loss": 0.3908, + "step": 36270 + }, + { + "epoch": 0.8060789193032349, + "grad_norm": 2.8125, + "learning_rate": 3.876000000000001e-06, + "loss": 0.3803, + "step": 36280 + }, + { + "epoch": 0.8063011020263064, + "grad_norm": 1.953125, + "learning_rate": 3.8715555555555555e-06, + "loss": 0.3769, + "step": 36290 + }, + { + "epoch": 0.8065232847493778, + "grad_norm": 2.328125, + "learning_rate": 3.867111111111111e-06, + "loss": 0.3546, + "step": 36300 + }, + { + "epoch": 0.8067454674724494, + "grad_norm": 2.4375, + "learning_rate": 3.862666666666667e-06, + "loss": 0.4036, + "step": 36310 + }, + { + "epoch": 0.8069676501955207, + "grad_norm": 3.203125, + "learning_rate": 3.858222222222223e-06, + "loss": 0.3939, + "step": 36320 + }, + { + "epoch": 0.8071898329185923, + "grad_norm": 2.9375, + "learning_rate": 3.853777777777778e-06, + "loss": 0.4026, + "step": 36330 + }, + { + "epoch": 0.8074120156416636, + "grad_norm": 2.484375, + "learning_rate": 3.849333333333334e-06, + "loss": 0.4102, + "step": 36340 + }, + { + "epoch": 0.8076341983647352, + "grad_norm": 2.671875, + "learning_rate": 3.844888888888889e-06, + "loss": 0.3906, + "step": 36350 + }, + { + "epoch": 0.8078563810878067, + "grad_norm": 2.421875, + "learning_rate": 3.840444444444445e-06, + "loss": 0.328, + "step": 36360 + }, + { + "epoch": 0.8080785638108781, + "grad_norm": 2.875, + "learning_rate": 3.836e-06, + "loss": 0.409, + "step": 36370 + }, + { + "epoch": 0.8083007465339496, + "grad_norm": 2.5625, + "learning_rate": 3.831555555555556e-06, + "loss": 0.3531, + "step": 36380 + }, + { + "epoch": 0.808522929257021, + "grad_norm": 2.765625, + "learning_rate": 3.827111111111112e-06, + "loss": 0.3441, + "step": 36390 + }, + { + "epoch": 0.8087451119800925, + "grad_norm": 2.359375, + "learning_rate": 3.8226666666666664e-06, + "loss": 0.3811, + "step": 36400 + }, + { + "epoch": 0.8089672947031639, + "grad_norm": 2.390625, + "learning_rate": 3.818222222222223e-06, + "loss": 0.3645, + "step": 36410 + }, + { + "epoch": 0.8091894774262354, + "grad_norm": 2.625, + "learning_rate": 3.813777777777778e-06, + "loss": 0.3758, + "step": 36420 + }, + { + "epoch": 0.8094116601493068, + "grad_norm": 2.46875, + "learning_rate": 3.809333333333334e-06, + "loss": 0.3884, + "step": 36430 + }, + { + "epoch": 0.8096338428723783, + "grad_norm": 1.984375, + "learning_rate": 3.804888888888889e-06, + "loss": 0.399, + "step": 36440 + }, + { + "epoch": 0.8098560255954497, + "grad_norm": 2.46875, + "learning_rate": 3.800444444444445e-06, + "loss": 0.3821, + "step": 36450 + }, + { + "epoch": 0.8100782083185212, + "grad_norm": 2.546875, + "learning_rate": 3.796e-06, + "loss": 0.3441, + "step": 36460 + }, + { + "epoch": 0.8103003910415926, + "grad_norm": 2.34375, + "learning_rate": 3.7915555555555563e-06, + "loss": 0.3482, + "step": 36470 + }, + { + "epoch": 0.8105225737646641, + "grad_norm": 2.265625, + "learning_rate": 3.7871111111111115e-06, + "loss": 0.3525, + "step": 36480 + }, + { + "epoch": 0.8107447564877355, + "grad_norm": 2.484375, + "learning_rate": 3.782666666666667e-06, + "loss": 0.3821, + "step": 36490 + }, + { + "epoch": 0.810966939210807, + "grad_norm": 2.75, + "learning_rate": 3.7782222222222225e-06, + "loss": 0.364, + "step": 36500 + }, + { + "epoch": 0.8111891219338784, + "grad_norm": 3.421875, + "learning_rate": 3.7737777777777778e-06, + "loss": 0.3863, + "step": 36510 + }, + { + "epoch": 0.8114113046569499, + "grad_norm": 2.421875, + "learning_rate": 3.769333333333334e-06, + "loss": 0.3838, + "step": 36520 + }, + { + "epoch": 0.8116334873800213, + "grad_norm": 2.734375, + "learning_rate": 3.764888888888889e-06, + "loss": 0.3697, + "step": 36530 + }, + { + "epoch": 0.8118556701030928, + "grad_norm": 2.40625, + "learning_rate": 3.760444444444445e-06, + "loss": 0.3424, + "step": 36540 + }, + { + "epoch": 0.8120778528261643, + "grad_norm": 2.546875, + "learning_rate": 3.756e-06, + "loss": 0.3625, + "step": 36550 + }, + { + "epoch": 0.8123000355492357, + "grad_norm": 2.5625, + "learning_rate": 3.7515555555555562e-06, + "loss": 0.3735, + "step": 36560 + }, + { + "epoch": 0.8125222182723072, + "grad_norm": 2.703125, + "learning_rate": 3.7471111111111115e-06, + "loss": 0.4031, + "step": 36570 + }, + { + "epoch": 0.8127444009953786, + "grad_norm": 2.5625, + "learning_rate": 3.742666666666667e-06, + "loss": 0.3842, + "step": 36580 + }, + { + "epoch": 0.8129665837184501, + "grad_norm": 2.484375, + "learning_rate": 3.7382222222222225e-06, + "loss": 0.36, + "step": 36590 + }, + { + "epoch": 0.8131887664415215, + "grad_norm": 2.515625, + "learning_rate": 3.7337777777777777e-06, + "loss": 0.3619, + "step": 36600 + }, + { + "epoch": 0.813410949164593, + "grad_norm": 2.515625, + "learning_rate": 3.729333333333334e-06, + "loss": 0.3943, + "step": 36610 + }, + { + "epoch": 0.8136331318876644, + "grad_norm": 2.78125, + "learning_rate": 3.724888888888889e-06, + "loss": 0.3786, + "step": 36620 + }, + { + "epoch": 0.8138553146107359, + "grad_norm": 2.6875, + "learning_rate": 3.720444444444445e-06, + "loss": 0.3729, + "step": 36630 + }, + { + "epoch": 0.8140774973338073, + "grad_norm": 2.21875, + "learning_rate": 3.716e-06, + "loss": 0.3617, + "step": 36640 + }, + { + "epoch": 0.8142996800568788, + "grad_norm": 3.28125, + "learning_rate": 3.711555555555556e-06, + "loss": 0.4053, + "step": 36650 + }, + { + "epoch": 0.8145218627799502, + "grad_norm": 1.9453125, + "learning_rate": 3.7071111111111115e-06, + "loss": 0.3896, + "step": 36660 + }, + { + "epoch": 0.8147440455030217, + "grad_norm": 2.828125, + "learning_rate": 3.702666666666667e-06, + "loss": 0.4024, + "step": 36670 + }, + { + "epoch": 0.8149662282260931, + "grad_norm": 2.453125, + "learning_rate": 3.6982222222222224e-06, + "loss": 0.3581, + "step": 36680 + }, + { + "epoch": 0.8151884109491646, + "grad_norm": 2.6875, + "learning_rate": 3.6937777777777785e-06, + "loss": 0.3698, + "step": 36690 + }, + { + "epoch": 0.815410593672236, + "grad_norm": 2.59375, + "learning_rate": 3.689333333333334e-06, + "loss": 0.3626, + "step": 36700 + }, + { + "epoch": 0.8156327763953075, + "grad_norm": 2.265625, + "learning_rate": 3.684888888888889e-06, + "loss": 0.4152, + "step": 36710 + }, + { + "epoch": 0.8158549591183789, + "grad_norm": 2.328125, + "learning_rate": 3.6804444444444448e-06, + "loss": 0.3904, + "step": 36720 + }, + { + "epoch": 0.8160771418414504, + "grad_norm": 2.3125, + "learning_rate": 3.676e-06, + "loss": 0.3647, + "step": 36730 + }, + { + "epoch": 0.8162993245645218, + "grad_norm": 2.265625, + "learning_rate": 3.671555555555556e-06, + "loss": 0.3711, + "step": 36740 + }, + { + "epoch": 0.8165215072875933, + "grad_norm": 2.140625, + "learning_rate": 3.6671111111111114e-06, + "loss": 0.3562, + "step": 36750 + }, + { + "epoch": 0.8167436900106648, + "grad_norm": 2.640625, + "learning_rate": 3.662666666666667e-06, + "loss": 0.3905, + "step": 36760 + }, + { + "epoch": 0.8169658727337362, + "grad_norm": 2.34375, + "learning_rate": 3.6582222222222224e-06, + "loss": 0.3713, + "step": 36770 + }, + { + "epoch": 0.8171880554568077, + "grad_norm": 2.484375, + "learning_rate": 3.6537777777777785e-06, + "loss": 0.3718, + "step": 36780 + }, + { + "epoch": 0.8174102381798791, + "grad_norm": 2.421875, + "learning_rate": 3.6493333333333338e-06, + "loss": 0.3669, + "step": 36790 + }, + { + "epoch": 0.8176324209029506, + "grad_norm": 2.5625, + "learning_rate": 3.644888888888889e-06, + "loss": 0.3661, + "step": 36800 + }, + { + "epoch": 0.817854603626022, + "grad_norm": 2.796875, + "learning_rate": 3.6404444444444447e-06, + "loss": 0.4021, + "step": 36810 + }, + { + "epoch": 0.8180767863490935, + "grad_norm": 2.1875, + "learning_rate": 3.636e-06, + "loss": 0.3581, + "step": 36820 + }, + { + "epoch": 0.8182989690721649, + "grad_norm": 2.359375, + "learning_rate": 3.631555555555556e-06, + "loss": 0.3789, + "step": 36830 + }, + { + "epoch": 0.8185211517952364, + "grad_norm": 2.03125, + "learning_rate": 3.6271111111111114e-06, + "loss": 0.3591, + "step": 36840 + }, + { + "epoch": 0.8187433345183078, + "grad_norm": 2.484375, + "learning_rate": 3.622666666666667e-06, + "loss": 0.3703, + "step": 36850 + }, + { + "epoch": 0.8189655172413793, + "grad_norm": 2.5625, + "learning_rate": 3.6182222222222223e-06, + "loss": 0.4259, + "step": 36860 + }, + { + "epoch": 0.8191876999644507, + "grad_norm": 2.328125, + "learning_rate": 3.6137777777777785e-06, + "loss": 0.3824, + "step": 36870 + }, + { + "epoch": 0.8194098826875222, + "grad_norm": 3.3125, + "learning_rate": 3.6093333333333337e-06, + "loss": 0.3941, + "step": 36880 + }, + { + "epoch": 0.8196320654105936, + "grad_norm": 2.515625, + "learning_rate": 3.6048888888888894e-06, + "loss": 0.3845, + "step": 36890 + }, + { + "epoch": 0.8198542481336651, + "grad_norm": 2.40625, + "learning_rate": 3.6004444444444447e-06, + "loss": 0.3932, + "step": 36900 + }, + { + "epoch": 0.8200764308567365, + "grad_norm": 2.734375, + "learning_rate": 3.596e-06, + "loss": 0.3741, + "step": 36910 + }, + { + "epoch": 0.820298613579808, + "grad_norm": 2.40625, + "learning_rate": 3.591555555555556e-06, + "loss": 0.3793, + "step": 36920 + }, + { + "epoch": 0.8205207963028794, + "grad_norm": 2.78125, + "learning_rate": 3.5871111111111113e-06, + "loss": 0.3621, + "step": 36930 + }, + { + "epoch": 0.820742979025951, + "grad_norm": 2.359375, + "learning_rate": 3.582666666666667e-06, + "loss": 0.3599, + "step": 36940 + }, + { + "epoch": 0.8209651617490223, + "grad_norm": 3.765625, + "learning_rate": 3.5782222222222223e-06, + "loss": 0.4144, + "step": 36950 + }, + { + "epoch": 0.8211873444720938, + "grad_norm": 2.734375, + "learning_rate": 3.5737777777777784e-06, + "loss": 0.3619, + "step": 36960 + }, + { + "epoch": 0.8214095271951654, + "grad_norm": 2.90625, + "learning_rate": 3.5693333333333337e-06, + "loss": 0.3865, + "step": 36970 + }, + { + "epoch": 0.8216317099182368, + "grad_norm": 2.375, + "learning_rate": 3.5648888888888894e-06, + "loss": 0.3761, + "step": 36980 + }, + { + "epoch": 0.8218538926413083, + "grad_norm": 2.5, + "learning_rate": 3.5604444444444447e-06, + "loss": 0.3877, + "step": 36990 + }, + { + "epoch": 0.8220760753643797, + "grad_norm": 2.328125, + "learning_rate": 3.5560000000000008e-06, + "loss": 0.3583, + "step": 37000 + }, + { + "epoch": 0.8222982580874512, + "grad_norm": 2.265625, + "learning_rate": 3.551555555555556e-06, + "loss": 0.3937, + "step": 37010 + }, + { + "epoch": 0.8225204408105226, + "grad_norm": 2.609375, + "learning_rate": 3.5471111111111113e-06, + "loss": 0.4002, + "step": 37020 + }, + { + "epoch": 0.8227426235335941, + "grad_norm": 3.015625, + "learning_rate": 3.542666666666667e-06, + "loss": 0.4044, + "step": 37030 + }, + { + "epoch": 0.8229648062566655, + "grad_norm": 2.859375, + "learning_rate": 3.5382222222222223e-06, + "loss": 0.3772, + "step": 37040 + }, + { + "epoch": 0.823186988979737, + "grad_norm": 2.390625, + "learning_rate": 3.5337777777777784e-06, + "loss": 0.3827, + "step": 37050 + }, + { + "epoch": 0.8234091717028084, + "grad_norm": 2.421875, + "learning_rate": 3.5293333333333336e-06, + "loss": 0.3963, + "step": 37060 + }, + { + "epoch": 0.8236313544258799, + "grad_norm": 2.890625, + "learning_rate": 3.5248888888888893e-06, + "loss": 0.4422, + "step": 37070 + }, + { + "epoch": 0.8238535371489513, + "grad_norm": 2.578125, + "learning_rate": 3.5204444444444446e-06, + "loss": 0.4013, + "step": 37080 + }, + { + "epoch": 0.8240757198720228, + "grad_norm": 2.890625, + "learning_rate": 3.5160000000000007e-06, + "loss": 0.3957, + "step": 37090 + }, + { + "epoch": 0.8242979025950942, + "grad_norm": 2.71875, + "learning_rate": 3.511555555555556e-06, + "loss": 0.3686, + "step": 37100 + }, + { + "epoch": 0.8245200853181657, + "grad_norm": 2.515625, + "learning_rate": 3.5071111111111113e-06, + "loss": 0.3786, + "step": 37110 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 2.203125, + "learning_rate": 3.502666666666667e-06, + "loss": 0.3427, + "step": 37120 + }, + { + "epoch": 0.8249644507643086, + "grad_norm": 2.421875, + "learning_rate": 3.4982222222222222e-06, + "loss": 0.3927, + "step": 37130 + }, + { + "epoch": 0.82518663348738, + "grad_norm": 2.8125, + "learning_rate": 3.4937777777777783e-06, + "loss": 0.395, + "step": 37140 + }, + { + "epoch": 0.8254088162104515, + "grad_norm": 2.703125, + "learning_rate": 3.4893333333333336e-06, + "loss": 0.4044, + "step": 37150 + }, + { + "epoch": 0.8256309989335229, + "grad_norm": 2.703125, + "learning_rate": 3.4848888888888893e-06, + "loss": 0.3699, + "step": 37160 + }, + { + "epoch": 0.8258531816565944, + "grad_norm": 2.375, + "learning_rate": 3.4804444444444446e-06, + "loss": 0.3468, + "step": 37170 + }, + { + "epoch": 0.8260753643796659, + "grad_norm": 2.96875, + "learning_rate": 3.4760000000000007e-06, + "loss": 0.3921, + "step": 37180 + }, + { + "epoch": 0.8262975471027373, + "grad_norm": 2.234375, + "learning_rate": 3.471555555555556e-06, + "loss": 0.3611, + "step": 37190 + }, + { + "epoch": 0.8265197298258088, + "grad_norm": 2.65625, + "learning_rate": 3.4671111111111116e-06, + "loss": 0.361, + "step": 37200 + }, + { + "epoch": 0.8267419125488802, + "grad_norm": 2.578125, + "learning_rate": 3.462666666666667e-06, + "loss": 0.3554, + "step": 37210 + }, + { + "epoch": 0.8269640952719517, + "grad_norm": 2.46875, + "learning_rate": 3.458222222222222e-06, + "loss": 0.3574, + "step": 37220 + }, + { + "epoch": 0.8271862779950231, + "grad_norm": 2.46875, + "learning_rate": 3.4537777777777783e-06, + "loss": 0.3913, + "step": 37230 + }, + { + "epoch": 0.8274084607180946, + "grad_norm": 2.28125, + "learning_rate": 3.4493333333333336e-06, + "loss": 0.3534, + "step": 37240 + }, + { + "epoch": 0.827630643441166, + "grad_norm": 2.53125, + "learning_rate": 3.4448888888888893e-06, + "loss": 0.3659, + "step": 37250 + }, + { + "epoch": 0.8278528261642375, + "grad_norm": 2.875, + "learning_rate": 3.4404444444444445e-06, + "loss": 0.3847, + "step": 37260 + }, + { + "epoch": 0.8280750088873089, + "grad_norm": 2.40625, + "learning_rate": 3.4360000000000006e-06, + "loss": 0.3676, + "step": 37270 + }, + { + "epoch": 0.8282971916103804, + "grad_norm": 2.234375, + "learning_rate": 3.431555555555556e-06, + "loss": 0.3727, + "step": 37280 + }, + { + "epoch": 0.8285193743334518, + "grad_norm": 2.4375, + "learning_rate": 3.4271111111111116e-06, + "loss": 0.36, + "step": 37290 + }, + { + "epoch": 0.8287415570565233, + "grad_norm": 2.171875, + "learning_rate": 3.422666666666667e-06, + "loss": 0.3995, + "step": 37300 + }, + { + "epoch": 0.8289637397795947, + "grad_norm": 2.734375, + "learning_rate": 3.4182222222222226e-06, + "loss": 0.3648, + "step": 37310 + }, + { + "epoch": 0.8291859225026662, + "grad_norm": 3.359375, + "learning_rate": 3.4137777777777783e-06, + "loss": 0.4034, + "step": 37320 + }, + { + "epoch": 0.8294081052257376, + "grad_norm": 2.421875, + "learning_rate": 3.4093333333333335e-06, + "loss": 0.3502, + "step": 37330 + }, + { + "epoch": 0.8296302879488091, + "grad_norm": 2.46875, + "learning_rate": 3.4048888888888892e-06, + "loss": 0.3505, + "step": 37340 + }, + { + "epoch": 0.8298524706718805, + "grad_norm": 2.703125, + "learning_rate": 3.4004444444444445e-06, + "loss": 0.3957, + "step": 37350 + }, + { + "epoch": 0.830074653394952, + "grad_norm": 2.859375, + "learning_rate": 3.3960000000000006e-06, + "loss": 0.3776, + "step": 37360 + }, + { + "epoch": 0.8302968361180235, + "grad_norm": 2.546875, + "learning_rate": 3.391555555555556e-06, + "loss": 0.3782, + "step": 37370 + }, + { + "epoch": 0.8305190188410949, + "grad_norm": 2.421875, + "learning_rate": 3.3871111111111116e-06, + "loss": 0.3891, + "step": 37380 + }, + { + "epoch": 0.8307412015641664, + "grad_norm": 2.234375, + "learning_rate": 3.382666666666667e-06, + "loss": 0.3471, + "step": 37390 + }, + { + "epoch": 0.8309633842872378, + "grad_norm": 2.765625, + "learning_rate": 3.3782222222222225e-06, + "loss": 0.4007, + "step": 37400 + }, + { + "epoch": 0.8311855670103093, + "grad_norm": 2.359375, + "learning_rate": 3.3737777777777782e-06, + "loss": 0.3866, + "step": 37410 + }, + { + "epoch": 0.8314077497333807, + "grad_norm": 2.59375, + "learning_rate": 3.3693333333333335e-06, + "loss": 0.3708, + "step": 37420 + }, + { + "epoch": 0.8316299324564522, + "grad_norm": 2.375, + "learning_rate": 3.364888888888889e-06, + "loss": 0.3908, + "step": 37430 + }, + { + "epoch": 0.8318521151795236, + "grad_norm": 2.65625, + "learning_rate": 3.3604444444444444e-06, + "loss": 0.3849, + "step": 37440 + }, + { + "epoch": 0.8320742979025951, + "grad_norm": 2.75, + "learning_rate": 3.3560000000000006e-06, + "loss": 0.3863, + "step": 37450 + }, + { + "epoch": 0.8322964806256665, + "grad_norm": 2.53125, + "learning_rate": 3.351555555555556e-06, + "loss": 0.3868, + "step": 37460 + }, + { + "epoch": 0.832518663348738, + "grad_norm": 2.609375, + "learning_rate": 3.3471111111111115e-06, + "loss": 0.3625, + "step": 37470 + }, + { + "epoch": 0.8327408460718094, + "grad_norm": 2.5625, + "learning_rate": 3.342666666666667e-06, + "loss": 0.3774, + "step": 37480 + }, + { + "epoch": 0.8329630287948809, + "grad_norm": 2.453125, + "learning_rate": 3.3382222222222225e-06, + "loss": 0.4237, + "step": 37490 + }, + { + "epoch": 0.8331852115179523, + "grad_norm": 2.8125, + "learning_rate": 3.333777777777778e-06, + "loss": 0.3394, + "step": 37500 + }, + { + "epoch": 0.8334073942410238, + "grad_norm": 2.765625, + "learning_rate": 3.329333333333334e-06, + "loss": 0.4392, + "step": 37510 + }, + { + "epoch": 0.8336295769640952, + "grad_norm": 3.015625, + "learning_rate": 3.324888888888889e-06, + "loss": 0.3995, + "step": 37520 + }, + { + "epoch": 0.8338517596871667, + "grad_norm": 2.171875, + "learning_rate": 3.3204444444444444e-06, + "loss": 0.3664, + "step": 37530 + }, + { + "epoch": 0.8340739424102381, + "grad_norm": 2.265625, + "learning_rate": 3.3160000000000005e-06, + "loss": 0.3388, + "step": 37540 + }, + { + "epoch": 0.8342961251333096, + "grad_norm": 2.421875, + "learning_rate": 3.311555555555556e-06, + "loss": 0.3804, + "step": 37550 + }, + { + "epoch": 0.834518307856381, + "grad_norm": 2.71875, + "learning_rate": 3.3071111111111115e-06, + "loss": 0.3699, + "step": 37560 + }, + { + "epoch": 0.8347404905794525, + "grad_norm": 2.4375, + "learning_rate": 3.3026666666666668e-06, + "loss": 0.3796, + "step": 37570 + }, + { + "epoch": 0.834962673302524, + "grad_norm": 2.640625, + "learning_rate": 3.298222222222223e-06, + "loss": 0.3949, + "step": 37580 + }, + { + "epoch": 0.8351848560255954, + "grad_norm": 2.046875, + "learning_rate": 3.293777777777778e-06, + "loss": 0.3359, + "step": 37590 + }, + { + "epoch": 0.835407038748667, + "grad_norm": 2.5625, + "learning_rate": 3.289333333333334e-06, + "loss": 0.3812, + "step": 37600 + }, + { + "epoch": 0.8356292214717383, + "grad_norm": 2.59375, + "learning_rate": 3.284888888888889e-06, + "loss": 0.3936, + "step": 37610 + }, + { + "epoch": 0.8358514041948099, + "grad_norm": 3.0, + "learning_rate": 3.280444444444445e-06, + "loss": 0.396, + "step": 37620 + }, + { + "epoch": 0.8360735869178813, + "grad_norm": 2.546875, + "learning_rate": 3.2760000000000005e-06, + "loss": 0.4006, + "step": 37630 + }, + { + "epoch": 0.8362957696409528, + "grad_norm": 2.640625, + "learning_rate": 3.2715555555555558e-06, + "loss": 0.4128, + "step": 37640 + }, + { + "epoch": 0.8365179523640242, + "grad_norm": 2.859375, + "learning_rate": 3.2671111111111114e-06, + "loss": 0.3945, + "step": 37650 + }, + { + "epoch": 0.8367401350870957, + "grad_norm": 2.328125, + "learning_rate": 3.2626666666666667e-06, + "loss": 0.3853, + "step": 37660 + }, + { + "epoch": 0.8369623178101671, + "grad_norm": 2.6875, + "learning_rate": 3.258222222222223e-06, + "loss": 0.3778, + "step": 37670 + }, + { + "epoch": 0.8371845005332386, + "grad_norm": 2.390625, + "learning_rate": 3.253777777777778e-06, + "loss": 0.3505, + "step": 37680 + }, + { + "epoch": 0.83740668325631, + "grad_norm": 2.109375, + "learning_rate": 3.249333333333334e-06, + "loss": 0.3252, + "step": 37690 + }, + { + "epoch": 0.8376288659793815, + "grad_norm": 2.734375, + "learning_rate": 3.244888888888889e-06, + "loss": 0.3809, + "step": 37700 + }, + { + "epoch": 0.8378510487024529, + "grad_norm": 2.703125, + "learning_rate": 3.2404444444444448e-06, + "loss": 0.3932, + "step": 37710 + }, + { + "epoch": 0.8380732314255244, + "grad_norm": 2.6875, + "learning_rate": 3.2360000000000004e-06, + "loss": 0.3969, + "step": 37720 + }, + { + "epoch": 0.8382954141485958, + "grad_norm": 2.703125, + "learning_rate": 3.2315555555555557e-06, + "loss": 0.3881, + "step": 37730 + }, + { + "epoch": 0.8385175968716673, + "grad_norm": 2.421875, + "learning_rate": 3.2271111111111114e-06, + "loss": 0.402, + "step": 37740 + }, + { + "epoch": 0.8387397795947387, + "grad_norm": 2.578125, + "learning_rate": 3.2226666666666667e-06, + "loss": 0.3891, + "step": 37750 + }, + { + "epoch": 0.8389619623178102, + "grad_norm": 2.640625, + "learning_rate": 3.2182222222222228e-06, + "loss": 0.3798, + "step": 37760 + }, + { + "epoch": 0.8391841450408816, + "grad_norm": 2.359375, + "learning_rate": 3.213777777777778e-06, + "loss": 0.381, + "step": 37770 + }, + { + "epoch": 0.8394063277639531, + "grad_norm": 2.5625, + "learning_rate": 3.2093333333333337e-06, + "loss": 0.3803, + "step": 37780 + }, + { + "epoch": 0.8396285104870246, + "grad_norm": 2.453125, + "learning_rate": 3.204888888888889e-06, + "loss": 0.3757, + "step": 37790 + }, + { + "epoch": 0.839850693210096, + "grad_norm": 2.515625, + "learning_rate": 3.2004444444444447e-06, + "loss": 0.3806, + "step": 37800 + }, + { + "epoch": 0.8400728759331675, + "grad_norm": 2.40625, + "learning_rate": 3.1960000000000004e-06, + "loss": 0.4007, + "step": 37810 + }, + { + "epoch": 0.8402950586562389, + "grad_norm": 2.765625, + "learning_rate": 3.191555555555556e-06, + "loss": 0.4154, + "step": 37820 + }, + { + "epoch": 0.8405172413793104, + "grad_norm": 2.53125, + "learning_rate": 3.1871111111111114e-06, + "loss": 0.4186, + "step": 37830 + }, + { + "epoch": 0.8407394241023818, + "grad_norm": 2.640625, + "learning_rate": 3.1826666666666666e-06, + "loss": 0.4009, + "step": 37840 + }, + { + "epoch": 0.8409616068254533, + "grad_norm": 2.71875, + "learning_rate": 3.1782222222222227e-06, + "loss": 0.4162, + "step": 37850 + }, + { + "epoch": 0.8411837895485247, + "grad_norm": 2.4375, + "learning_rate": 3.173777777777778e-06, + "loss": 0.3911, + "step": 37860 + }, + { + "epoch": 0.8414059722715962, + "grad_norm": 2.65625, + "learning_rate": 3.1693333333333337e-06, + "loss": 0.3891, + "step": 37870 + }, + { + "epoch": 0.8416281549946676, + "grad_norm": 2.984375, + "learning_rate": 3.164888888888889e-06, + "loss": 0.3632, + "step": 37880 + }, + { + "epoch": 0.8418503377177391, + "grad_norm": 2.015625, + "learning_rate": 3.1604444444444447e-06, + "loss": 0.3702, + "step": 37890 + }, + { + "epoch": 0.8420725204408105, + "grad_norm": 2.421875, + "learning_rate": 3.1560000000000004e-06, + "loss": 0.3673, + "step": 37900 + }, + { + "epoch": 0.842294703163882, + "grad_norm": 2.375, + "learning_rate": 3.151555555555556e-06, + "loss": 0.3518, + "step": 37910 + }, + { + "epoch": 0.8425168858869534, + "grad_norm": 2.28125, + "learning_rate": 3.1471111111111113e-06, + "loss": 0.382, + "step": 37920 + }, + { + "epoch": 0.8427390686100249, + "grad_norm": 2.640625, + "learning_rate": 3.142666666666667e-06, + "loss": 0.377, + "step": 37930 + }, + { + "epoch": 0.8429612513330963, + "grad_norm": 2.453125, + "learning_rate": 3.1382222222222227e-06, + "loss": 0.4045, + "step": 37940 + }, + { + "epoch": 0.8431834340561678, + "grad_norm": 3.25, + "learning_rate": 3.133777777777778e-06, + "loss": 0.3554, + "step": 37950 + }, + { + "epoch": 0.8434056167792392, + "grad_norm": 2.75, + "learning_rate": 3.1293333333333337e-06, + "loss": 0.3976, + "step": 37960 + }, + { + "epoch": 0.8436277995023107, + "grad_norm": 2.609375, + "learning_rate": 3.124888888888889e-06, + "loss": 0.4088, + "step": 37970 + }, + { + "epoch": 0.8438499822253821, + "grad_norm": 2.359375, + "learning_rate": 3.1204444444444446e-06, + "loss": 0.3825, + "step": 37980 + }, + { + "epoch": 0.8440721649484536, + "grad_norm": 2.15625, + "learning_rate": 3.1160000000000003e-06, + "loss": 0.3749, + "step": 37990 + }, + { + "epoch": 0.8442943476715251, + "grad_norm": 2.828125, + "learning_rate": 3.111555555555556e-06, + "loss": 0.3941, + "step": 38000 + }, + { + "epoch": 0.8445165303945965, + "grad_norm": 2.515625, + "learning_rate": 3.1071111111111113e-06, + "loss": 0.3638, + "step": 38010 + }, + { + "epoch": 0.844738713117668, + "grad_norm": 2.5625, + "learning_rate": 3.102666666666667e-06, + "loss": 0.3978, + "step": 38020 + }, + { + "epoch": 0.8449608958407394, + "grad_norm": 2.546875, + "learning_rate": 3.0982222222222227e-06, + "loss": 0.3633, + "step": 38030 + }, + { + "epoch": 0.8451830785638109, + "grad_norm": 2.953125, + "learning_rate": 3.093777777777778e-06, + "loss": 0.3862, + "step": 38040 + }, + { + "epoch": 0.8454052612868823, + "grad_norm": 2.984375, + "learning_rate": 3.0893333333333336e-06, + "loss": 0.3785, + "step": 38050 + }, + { + "epoch": 0.8456274440099538, + "grad_norm": 2.578125, + "learning_rate": 3.084888888888889e-06, + "loss": 0.389, + "step": 38060 + }, + { + "epoch": 0.8458496267330252, + "grad_norm": 2.875, + "learning_rate": 3.0804444444444446e-06, + "loss": 0.3706, + "step": 38070 + }, + { + "epoch": 0.8460718094560967, + "grad_norm": 2.015625, + "learning_rate": 3.0760000000000003e-06, + "loss": 0.3671, + "step": 38080 + }, + { + "epoch": 0.8462939921791681, + "grad_norm": 2.765625, + "learning_rate": 3.071555555555556e-06, + "loss": 0.4127, + "step": 38090 + }, + { + "epoch": 0.8465161749022396, + "grad_norm": 2.609375, + "learning_rate": 3.0671111111111112e-06, + "loss": 0.3619, + "step": 38100 + }, + { + "epoch": 0.846738357625311, + "grad_norm": 2.0625, + "learning_rate": 3.062666666666667e-06, + "loss": 0.3844, + "step": 38110 + }, + { + "epoch": 0.8469605403483825, + "grad_norm": 2.5625, + "learning_rate": 3.0582222222222226e-06, + "loss": 0.4027, + "step": 38120 + }, + { + "epoch": 0.8471827230714539, + "grad_norm": 2.609375, + "learning_rate": 3.0537777777777783e-06, + "loss": 0.379, + "step": 38130 + }, + { + "epoch": 0.8474049057945254, + "grad_norm": 2.28125, + "learning_rate": 3.0493333333333336e-06, + "loss": 0.4034, + "step": 38140 + }, + { + "epoch": 0.8476270885175968, + "grad_norm": 2.6875, + "learning_rate": 3.044888888888889e-06, + "loss": 0.4175, + "step": 38150 + }, + { + "epoch": 0.8478492712406683, + "grad_norm": 2.640625, + "learning_rate": 3.0404444444444445e-06, + "loss": 0.3888, + "step": 38160 + }, + { + "epoch": 0.8480714539637397, + "grad_norm": 2.390625, + "learning_rate": 3.0360000000000002e-06, + "loss": 0.349, + "step": 38170 + }, + { + "epoch": 0.8482936366868112, + "grad_norm": 2.375, + "learning_rate": 3.031555555555556e-06, + "loss": 0.3918, + "step": 38180 + }, + { + "epoch": 0.8485158194098826, + "grad_norm": 2.734375, + "learning_rate": 3.027111111111111e-06, + "loss": 0.3855, + "step": 38190 + }, + { + "epoch": 0.8487380021329541, + "grad_norm": 2.515625, + "learning_rate": 3.022666666666667e-06, + "loss": 0.3281, + "step": 38200 + }, + { + "epoch": 0.8489601848560256, + "grad_norm": 2.28125, + "learning_rate": 3.0182222222222226e-06, + "loss": 0.3725, + "step": 38210 + }, + { + "epoch": 0.849182367579097, + "grad_norm": 2.046875, + "learning_rate": 3.0137777777777783e-06, + "loss": 0.3511, + "step": 38220 + }, + { + "epoch": 0.8494045503021685, + "grad_norm": 2.703125, + "learning_rate": 3.0093333333333335e-06, + "loss": 0.3693, + "step": 38230 + }, + { + "epoch": 0.8496267330252399, + "grad_norm": 2.234375, + "learning_rate": 3.0048888888888892e-06, + "loss": 0.3991, + "step": 38240 + }, + { + "epoch": 0.8498489157483115, + "grad_norm": 2.609375, + "learning_rate": 3.0004444444444445e-06, + "loss": 0.3783, + "step": 38250 + }, + { + "epoch": 0.8500710984713828, + "grad_norm": 2.78125, + "learning_rate": 2.996e-06, + "loss": 0.3797, + "step": 38260 + }, + { + "epoch": 0.8502932811944544, + "grad_norm": 2.359375, + "learning_rate": 2.991555555555556e-06, + "loss": 0.3732, + "step": 38270 + }, + { + "epoch": 0.8505154639175257, + "grad_norm": 2.125, + "learning_rate": 2.987111111111111e-06, + "loss": 0.3435, + "step": 38280 + }, + { + "epoch": 0.8507376466405973, + "grad_norm": 1.96875, + "learning_rate": 2.982666666666667e-06, + "loss": 0.3585, + "step": 38290 + }, + { + "epoch": 0.8509598293636687, + "grad_norm": 2.90625, + "learning_rate": 2.9782222222222225e-06, + "loss": 0.3673, + "step": 38300 + }, + { + "epoch": 0.8511820120867402, + "grad_norm": 2.671875, + "learning_rate": 2.9737777777777782e-06, + "loss": 0.3812, + "step": 38310 + }, + { + "epoch": 0.8514041948098116, + "grad_norm": 2.453125, + "learning_rate": 2.9693333333333335e-06, + "loss": 0.3847, + "step": 38320 + }, + { + "epoch": 0.8516263775328831, + "grad_norm": 2.984375, + "learning_rate": 2.964888888888889e-06, + "loss": 0.379, + "step": 38330 + }, + { + "epoch": 0.8518485602559545, + "grad_norm": 2.953125, + "learning_rate": 2.9604444444444445e-06, + "loss": 0.3578, + "step": 38340 + }, + { + "epoch": 0.852070742979026, + "grad_norm": 2.9375, + "learning_rate": 2.956e-06, + "loss": 0.3734, + "step": 38350 + }, + { + "epoch": 0.8522929257020974, + "grad_norm": 2.8125, + "learning_rate": 2.951555555555556e-06, + "loss": 0.3857, + "step": 38360 + }, + { + "epoch": 0.8525151084251689, + "grad_norm": 2.65625, + "learning_rate": 2.947111111111111e-06, + "loss": 0.3981, + "step": 38370 + }, + { + "epoch": 0.8527372911482403, + "grad_norm": 2.359375, + "learning_rate": 2.942666666666667e-06, + "loss": 0.3664, + "step": 38380 + }, + { + "epoch": 0.8529594738713118, + "grad_norm": 2.546875, + "learning_rate": 2.9382222222222225e-06, + "loss": 0.3549, + "step": 38390 + }, + { + "epoch": 0.8531816565943833, + "grad_norm": 2.90625, + "learning_rate": 2.933777777777778e-06, + "loss": 0.3706, + "step": 38400 + }, + { + "epoch": 0.8534038393174547, + "grad_norm": 2.578125, + "learning_rate": 2.9293333333333335e-06, + "loss": 0.3724, + "step": 38410 + }, + { + "epoch": 0.8536260220405262, + "grad_norm": 2.53125, + "learning_rate": 2.924888888888889e-06, + "loss": 0.411, + "step": 38420 + }, + { + "epoch": 0.8538482047635976, + "grad_norm": 2.65625, + "learning_rate": 2.9204444444444444e-06, + "loss": 0.3301, + "step": 38430 + }, + { + "epoch": 0.8540703874866691, + "grad_norm": 2.75, + "learning_rate": 2.9160000000000005e-06, + "loss": 0.3766, + "step": 38440 + }, + { + "epoch": 0.8542925702097405, + "grad_norm": 3.015625, + "learning_rate": 2.911555555555556e-06, + "loss": 0.3619, + "step": 38450 + }, + { + "epoch": 0.854514752932812, + "grad_norm": 2.578125, + "learning_rate": 2.907111111111111e-06, + "loss": 0.3743, + "step": 38460 + }, + { + "epoch": 0.8547369356558834, + "grad_norm": 2.25, + "learning_rate": 2.9026666666666668e-06, + "loss": 0.3742, + "step": 38470 + }, + { + "epoch": 0.8549591183789549, + "grad_norm": 2.25, + "learning_rate": 2.8982222222222225e-06, + "loss": 0.3755, + "step": 38480 + }, + { + "epoch": 0.8551813011020263, + "grad_norm": 2.140625, + "learning_rate": 2.893777777777778e-06, + "loss": 0.3777, + "step": 38490 + }, + { + "epoch": 0.8554034838250978, + "grad_norm": 2.203125, + "learning_rate": 2.8893333333333334e-06, + "loss": 0.3395, + "step": 38500 + }, + { + "epoch": 0.8556256665481692, + "grad_norm": 2.359375, + "learning_rate": 2.884888888888889e-06, + "loss": 0.3713, + "step": 38510 + }, + { + "epoch": 0.8558478492712407, + "grad_norm": 2.84375, + "learning_rate": 2.880444444444445e-06, + "loss": 0.3555, + "step": 38520 + }, + { + "epoch": 0.8560700319943121, + "grad_norm": 2.203125, + "learning_rate": 2.8760000000000005e-06, + "loss": 0.3666, + "step": 38530 + }, + { + "epoch": 0.8562922147173836, + "grad_norm": 2.3125, + "learning_rate": 2.8715555555555558e-06, + "loss": 0.3908, + "step": 38540 + }, + { + "epoch": 0.856514397440455, + "grad_norm": 2.40625, + "learning_rate": 2.8671111111111115e-06, + "loss": 0.3939, + "step": 38550 + }, + { + "epoch": 0.8567365801635265, + "grad_norm": 2.484375, + "learning_rate": 2.8626666666666667e-06, + "loss": 0.402, + "step": 38560 + }, + { + "epoch": 0.8569587628865979, + "grad_norm": 2.59375, + "learning_rate": 2.8582222222222224e-06, + "loss": 0.3823, + "step": 38570 + }, + { + "epoch": 0.8571809456096694, + "grad_norm": 2.4375, + "learning_rate": 2.853777777777778e-06, + "loss": 0.3862, + "step": 38580 + }, + { + "epoch": 0.8574031283327408, + "grad_norm": 2.9375, + "learning_rate": 2.8493333333333334e-06, + "loss": 0.3953, + "step": 38590 + }, + { + "epoch": 0.8576253110558123, + "grad_norm": 2.640625, + "learning_rate": 2.844888888888889e-06, + "loss": 0.392, + "step": 38600 + }, + { + "epoch": 0.8578474937788838, + "grad_norm": 2.453125, + "learning_rate": 2.8404444444444448e-06, + "loss": 0.3912, + "step": 38610 + }, + { + "epoch": 0.8580696765019552, + "grad_norm": 2.671875, + "learning_rate": 2.8360000000000005e-06, + "loss": 0.3748, + "step": 38620 + }, + { + "epoch": 0.8582918592250267, + "grad_norm": 2.390625, + "learning_rate": 2.8315555555555557e-06, + "loss": 0.3653, + "step": 38630 + }, + { + "epoch": 0.8585140419480981, + "grad_norm": 2.375, + "learning_rate": 2.8271111111111114e-06, + "loss": 0.3418, + "step": 38640 + }, + { + "epoch": 0.8587362246711696, + "grad_norm": 3.03125, + "learning_rate": 2.8226666666666667e-06, + "loss": 0.3835, + "step": 38650 + }, + { + "epoch": 0.858958407394241, + "grad_norm": 2.484375, + "learning_rate": 2.8182222222222224e-06, + "loss": 0.3827, + "step": 38660 + }, + { + "epoch": 0.8591805901173125, + "grad_norm": 2.375, + "learning_rate": 2.813777777777778e-06, + "loss": 0.3809, + "step": 38670 + }, + { + "epoch": 0.8594027728403839, + "grad_norm": 3.296875, + "learning_rate": 2.8093333333333333e-06, + "loss": 0.3883, + "step": 38680 + }, + { + "epoch": 0.8596249555634554, + "grad_norm": 2.59375, + "learning_rate": 2.804888888888889e-06, + "loss": 0.3659, + "step": 38690 + }, + { + "epoch": 0.8598471382865268, + "grad_norm": 2.953125, + "learning_rate": 2.8004444444444447e-06, + "loss": 0.3739, + "step": 38700 + }, + { + "epoch": 0.8600693210095983, + "grad_norm": 2.484375, + "learning_rate": 2.7960000000000004e-06, + "loss": 0.354, + "step": 38710 + }, + { + "epoch": 0.8602915037326697, + "grad_norm": 2.46875, + "learning_rate": 2.7915555555555557e-06, + "loss": 0.3551, + "step": 38720 + }, + { + "epoch": 0.8605136864557412, + "grad_norm": 2.5, + "learning_rate": 2.7871111111111114e-06, + "loss": 0.3688, + "step": 38730 + }, + { + "epoch": 0.8607358691788126, + "grad_norm": 3.28125, + "learning_rate": 2.7826666666666666e-06, + "loss": 0.3777, + "step": 38740 + }, + { + "epoch": 0.8609580519018841, + "grad_norm": 2.671875, + "learning_rate": 2.7782222222222228e-06, + "loss": 0.4131, + "step": 38750 + }, + { + "epoch": 0.8611802346249555, + "grad_norm": 3.0, + "learning_rate": 2.773777777777778e-06, + "loss": 0.3677, + "step": 38760 + }, + { + "epoch": 0.861402417348027, + "grad_norm": 2.515625, + "learning_rate": 2.7693333333333333e-06, + "loss": 0.3922, + "step": 38770 + }, + { + "epoch": 0.8616246000710984, + "grad_norm": 2.875, + "learning_rate": 2.764888888888889e-06, + "loss": 0.378, + "step": 38780 + }, + { + "epoch": 0.8618467827941699, + "grad_norm": 3.1875, + "learning_rate": 2.7604444444444447e-06, + "loss": 0.3579, + "step": 38790 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 2.359375, + "learning_rate": 2.7560000000000004e-06, + "loss": 0.3672, + "step": 38800 + }, + { + "epoch": 0.8622911482403128, + "grad_norm": 2.5, + "learning_rate": 2.7515555555555556e-06, + "loss": 0.3729, + "step": 38810 + }, + { + "epoch": 0.8625133309633843, + "grad_norm": 3.28125, + "learning_rate": 2.7471111111111113e-06, + "loss": 0.3626, + "step": 38820 + }, + { + "epoch": 0.8627355136864557, + "grad_norm": 2.71875, + "learning_rate": 2.7426666666666666e-06, + "loss": 0.4237, + "step": 38830 + }, + { + "epoch": 0.8629576964095272, + "grad_norm": 2.4375, + "learning_rate": 2.7382222222222227e-06, + "loss": 0.4017, + "step": 38840 + }, + { + "epoch": 0.8631798791325986, + "grad_norm": 2.59375, + "learning_rate": 2.733777777777778e-06, + "loss": 0.3437, + "step": 38850 + }, + { + "epoch": 0.8634020618556701, + "grad_norm": 2.53125, + "learning_rate": 2.7293333333333333e-06, + "loss": 0.3578, + "step": 38860 + }, + { + "epoch": 0.8636242445787415, + "grad_norm": 2.171875, + "learning_rate": 2.724888888888889e-06, + "loss": 0.3849, + "step": 38870 + }, + { + "epoch": 0.863846427301813, + "grad_norm": 2.859375, + "learning_rate": 2.7204444444444446e-06, + "loss": 0.3795, + "step": 38880 + }, + { + "epoch": 0.8640686100248844, + "grad_norm": 2.484375, + "learning_rate": 2.7160000000000003e-06, + "loss": 0.3713, + "step": 38890 + }, + { + "epoch": 0.864290792747956, + "grad_norm": 2.28125, + "learning_rate": 2.7115555555555556e-06, + "loss": 0.3595, + "step": 38900 + }, + { + "epoch": 0.8645129754710273, + "grad_norm": 2.6875, + "learning_rate": 2.7071111111111113e-06, + "loss": 0.3469, + "step": 38910 + }, + { + "epoch": 0.8647351581940989, + "grad_norm": 2.734375, + "learning_rate": 2.7026666666666666e-06, + "loss": 0.3666, + "step": 38920 + }, + { + "epoch": 0.8649573409171702, + "grad_norm": 2.46875, + "learning_rate": 2.6982222222222227e-06, + "loss": 0.3731, + "step": 38930 + }, + { + "epoch": 0.8651795236402418, + "grad_norm": 2.578125, + "learning_rate": 2.693777777777778e-06, + "loss": 0.385, + "step": 38940 + }, + { + "epoch": 0.8654017063633132, + "grad_norm": 2.671875, + "learning_rate": 2.6893333333333336e-06, + "loss": 0.3696, + "step": 38950 + }, + { + "epoch": 0.8656238890863847, + "grad_norm": 2.609375, + "learning_rate": 2.684888888888889e-06, + "loss": 0.4151, + "step": 38960 + }, + { + "epoch": 0.865846071809456, + "grad_norm": 2.6875, + "learning_rate": 2.6804444444444446e-06, + "loss": 0.3815, + "step": 38970 + }, + { + "epoch": 0.8660682545325276, + "grad_norm": 3.34375, + "learning_rate": 2.6760000000000003e-06, + "loss": 0.3737, + "step": 38980 + }, + { + "epoch": 0.866290437255599, + "grad_norm": 2.75, + "learning_rate": 2.6715555555555556e-06, + "loss": 0.378, + "step": 38990 + }, + { + "epoch": 0.8665126199786705, + "grad_norm": 2.40625, + "learning_rate": 2.6671111111111113e-06, + "loss": 0.3694, + "step": 39000 + }, + { + "epoch": 0.8667348027017419, + "grad_norm": 2.859375, + "learning_rate": 2.6626666666666665e-06, + "loss": 0.3726, + "step": 39010 + }, + { + "epoch": 0.8669569854248134, + "grad_norm": 2.59375, + "learning_rate": 2.6582222222222226e-06, + "loss": 0.3751, + "step": 39020 + }, + { + "epoch": 0.8671791681478849, + "grad_norm": 2.515625, + "learning_rate": 2.653777777777778e-06, + "loss": 0.3833, + "step": 39030 + }, + { + "epoch": 0.8674013508709563, + "grad_norm": 2.9375, + "learning_rate": 2.6493333333333336e-06, + "loss": 0.4114, + "step": 39040 + }, + { + "epoch": 0.8676235335940278, + "grad_norm": 2.984375, + "learning_rate": 2.644888888888889e-06, + "loss": 0.3759, + "step": 39050 + }, + { + "epoch": 0.8678457163170992, + "grad_norm": 2.46875, + "learning_rate": 2.640444444444445e-06, + "loss": 0.3894, + "step": 39060 + }, + { + "epoch": 0.8680678990401707, + "grad_norm": 2.625, + "learning_rate": 2.6360000000000003e-06, + "loss": 0.3688, + "step": 39070 + }, + { + "epoch": 0.8682900817632421, + "grad_norm": 2.359375, + "learning_rate": 2.6315555555555555e-06, + "loss": 0.3668, + "step": 39080 + }, + { + "epoch": 0.8685122644863136, + "grad_norm": 2.734375, + "learning_rate": 2.6271111111111112e-06, + "loss": 0.3603, + "step": 39090 + }, + { + "epoch": 0.868734447209385, + "grad_norm": 2.46875, + "learning_rate": 2.6226666666666665e-06, + "loss": 0.3859, + "step": 39100 + }, + { + "epoch": 0.8689566299324565, + "grad_norm": 3.078125, + "learning_rate": 2.6182222222222226e-06, + "loss": 0.3455, + "step": 39110 + }, + { + "epoch": 0.8691788126555279, + "grad_norm": 2.59375, + "learning_rate": 2.613777777777778e-06, + "loss": 0.3751, + "step": 39120 + }, + { + "epoch": 0.8694009953785994, + "grad_norm": 2.1875, + "learning_rate": 2.6093333333333336e-06, + "loss": 0.3494, + "step": 39130 + }, + { + "epoch": 0.8696231781016708, + "grad_norm": 2.21875, + "learning_rate": 2.604888888888889e-06, + "loss": 0.4063, + "step": 39140 + }, + { + "epoch": 0.8698453608247423, + "grad_norm": 2.6875, + "learning_rate": 2.600444444444445e-06, + "loss": 0.4022, + "step": 39150 + }, + { + "epoch": 0.8700675435478137, + "grad_norm": 3.296875, + "learning_rate": 2.5960000000000002e-06, + "loss": 0.3903, + "step": 39160 + }, + { + "epoch": 0.8702897262708852, + "grad_norm": 2.21875, + "learning_rate": 2.5915555555555555e-06, + "loss": 0.4057, + "step": 39170 + }, + { + "epoch": 0.8705119089939566, + "grad_norm": 2.3125, + "learning_rate": 2.587111111111111e-06, + "loss": 0.3593, + "step": 39180 + }, + { + "epoch": 0.8707340917170281, + "grad_norm": 2.953125, + "learning_rate": 2.5826666666666664e-06, + "loss": 0.3396, + "step": 39190 + }, + { + "epoch": 0.8709562744400995, + "grad_norm": 2.375, + "learning_rate": 2.5782222222222226e-06, + "loss": 0.3696, + "step": 39200 + }, + { + "epoch": 0.871178457163171, + "grad_norm": 2.234375, + "learning_rate": 2.573777777777778e-06, + "loss": 0.3804, + "step": 39210 + }, + { + "epoch": 0.8714006398862425, + "grad_norm": 2.265625, + "learning_rate": 2.5693333333333335e-06, + "loss": 0.4056, + "step": 39220 + }, + { + "epoch": 0.8716228226093139, + "grad_norm": 2.375, + "learning_rate": 2.564888888888889e-06, + "loss": 0.3712, + "step": 39230 + }, + { + "epoch": 0.8718450053323854, + "grad_norm": 2.6875, + "learning_rate": 2.560444444444445e-06, + "loss": 0.3899, + "step": 39240 + }, + { + "epoch": 0.8720671880554568, + "grad_norm": 3.09375, + "learning_rate": 2.556e-06, + "loss": 0.3869, + "step": 39250 + }, + { + "epoch": 0.8722893707785283, + "grad_norm": 2.34375, + "learning_rate": 2.551555555555556e-06, + "loss": 0.3569, + "step": 39260 + }, + { + "epoch": 0.8725115535015997, + "grad_norm": 2.453125, + "learning_rate": 2.547111111111111e-06, + "loss": 0.3602, + "step": 39270 + }, + { + "epoch": 0.8727337362246712, + "grad_norm": 2.453125, + "learning_rate": 2.5426666666666664e-06, + "loss": 0.3651, + "step": 39280 + }, + { + "epoch": 0.8729559189477426, + "grad_norm": 2.453125, + "learning_rate": 2.5382222222222225e-06, + "loss": 0.4032, + "step": 39290 + }, + { + "epoch": 0.8731781016708141, + "grad_norm": 2.390625, + "learning_rate": 2.533777777777778e-06, + "loss": 0.4008, + "step": 39300 + }, + { + "epoch": 0.8734002843938855, + "grad_norm": 2.328125, + "learning_rate": 2.5293333333333335e-06, + "loss": 0.3781, + "step": 39310 + }, + { + "epoch": 0.873622467116957, + "grad_norm": 3.234375, + "learning_rate": 2.5248888888888888e-06, + "loss": 0.3624, + "step": 39320 + }, + { + "epoch": 0.8738446498400284, + "grad_norm": 2.765625, + "learning_rate": 2.520444444444445e-06, + "loss": 0.3744, + "step": 39330 + }, + { + "epoch": 0.8740668325630999, + "grad_norm": 2.984375, + "learning_rate": 2.516e-06, + "loss": 0.3678, + "step": 39340 + }, + { + "epoch": 0.8742890152861713, + "grad_norm": 2.078125, + "learning_rate": 2.511555555555556e-06, + "loss": 0.3681, + "step": 39350 + }, + { + "epoch": 0.8745111980092428, + "grad_norm": 2.5625, + "learning_rate": 2.507111111111111e-06, + "loss": 0.3679, + "step": 39360 + }, + { + "epoch": 0.8747333807323142, + "grad_norm": 2.296875, + "learning_rate": 2.5026666666666672e-06, + "loss": 0.3866, + "step": 39370 + }, + { + "epoch": 0.8749555634553857, + "grad_norm": 3.3125, + "learning_rate": 2.4982222222222225e-06, + "loss": 0.3862, + "step": 39380 + }, + { + "epoch": 0.8751777461784571, + "grad_norm": 2.203125, + "learning_rate": 2.493777777777778e-06, + "loss": 0.3422, + "step": 39390 + }, + { + "epoch": 0.8753999289015286, + "grad_norm": 2.4375, + "learning_rate": 2.4893333333333334e-06, + "loss": 0.3805, + "step": 39400 + }, + { + "epoch": 0.8756221116246, + "grad_norm": 2.546875, + "learning_rate": 2.484888888888889e-06, + "loss": 0.4086, + "step": 39410 + }, + { + "epoch": 0.8758442943476715, + "grad_norm": 2.328125, + "learning_rate": 2.480444444444445e-06, + "loss": 0.3776, + "step": 39420 + }, + { + "epoch": 0.876066477070743, + "grad_norm": 2.453125, + "learning_rate": 2.476e-06, + "loss": 0.3834, + "step": 39430 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 2.859375, + "learning_rate": 2.4715555555555558e-06, + "loss": 0.3931, + "step": 39440 + }, + { + "epoch": 0.8765108425168859, + "grad_norm": 2.96875, + "learning_rate": 2.467111111111111e-06, + "loss": 0.3288, + "step": 39450 + }, + { + "epoch": 0.8767330252399573, + "grad_norm": 2.6875, + "learning_rate": 2.4626666666666667e-06, + "loss": 0.4021, + "step": 39460 + }, + { + "epoch": 0.8769552079630288, + "grad_norm": 2.3125, + "learning_rate": 2.4582222222222224e-06, + "loss": 0.3973, + "step": 39470 + }, + { + "epoch": 0.8771773906861002, + "grad_norm": 2.578125, + "learning_rate": 2.453777777777778e-06, + "loss": 0.3773, + "step": 39480 + }, + { + "epoch": 0.8773995734091717, + "grad_norm": 2.265625, + "learning_rate": 2.4493333333333334e-06, + "loss": 0.3593, + "step": 39490 + }, + { + "epoch": 0.8776217561322431, + "grad_norm": 2.46875, + "learning_rate": 2.444888888888889e-06, + "loss": 0.4039, + "step": 39500 + }, + { + "epoch": 0.8778439388553146, + "grad_norm": 2.765625, + "learning_rate": 2.4404444444444448e-06, + "loss": 0.3548, + "step": 39510 + }, + { + "epoch": 0.878066121578386, + "grad_norm": 2.625, + "learning_rate": 2.4360000000000005e-06, + "loss": 0.3488, + "step": 39520 + }, + { + "epoch": 0.8782883043014575, + "grad_norm": 2.484375, + "learning_rate": 2.4315555555555557e-06, + "loss": 0.3653, + "step": 39530 + }, + { + "epoch": 0.8785104870245289, + "grad_norm": 2.3125, + "learning_rate": 2.427111111111111e-06, + "loss": 0.3435, + "step": 39540 + }, + { + "epoch": 0.8787326697476004, + "grad_norm": 2.578125, + "learning_rate": 2.4226666666666667e-06, + "loss": 0.4013, + "step": 39550 + }, + { + "epoch": 0.8789548524706718, + "grad_norm": 2.78125, + "learning_rate": 2.4182222222222224e-06, + "loss": 0.379, + "step": 39560 + }, + { + "epoch": 0.8791770351937434, + "grad_norm": 2.140625, + "learning_rate": 2.413777777777778e-06, + "loss": 0.334, + "step": 39570 + }, + { + "epoch": 0.8793992179168147, + "grad_norm": 2.234375, + "learning_rate": 2.4093333333333334e-06, + "loss": 0.3664, + "step": 39580 + }, + { + "epoch": 0.8796214006398863, + "grad_norm": 2.65625, + "learning_rate": 2.404888888888889e-06, + "loss": 0.332, + "step": 39590 + }, + { + "epoch": 0.8798435833629576, + "grad_norm": 3.359375, + "learning_rate": 2.4004444444444447e-06, + "loss": 0.3774, + "step": 39600 + }, + { + "epoch": 0.8800657660860292, + "grad_norm": 2.671875, + "learning_rate": 2.3960000000000004e-06, + "loss": 0.4064, + "step": 39610 + }, + { + "epoch": 0.8802879488091006, + "grad_norm": 2.984375, + "learning_rate": 2.3915555555555557e-06, + "loss": 0.3968, + "step": 39620 + }, + { + "epoch": 0.8805101315321721, + "grad_norm": 2.5625, + "learning_rate": 2.3871111111111114e-06, + "loss": 0.366, + "step": 39630 + }, + { + "epoch": 0.8807323142552436, + "grad_norm": 2.46875, + "learning_rate": 2.3826666666666667e-06, + "loss": 0.3739, + "step": 39640 + }, + { + "epoch": 0.880954496978315, + "grad_norm": 2.875, + "learning_rate": 2.3782222222222224e-06, + "loss": 0.3934, + "step": 39650 + }, + { + "epoch": 0.8811766797013865, + "grad_norm": 2.59375, + "learning_rate": 2.373777777777778e-06, + "loss": 0.3518, + "step": 39660 + }, + { + "epoch": 0.8813988624244579, + "grad_norm": 2.15625, + "learning_rate": 2.3693333333333333e-06, + "loss": 0.3619, + "step": 39670 + }, + { + "epoch": 0.8816210451475294, + "grad_norm": 2.734375, + "learning_rate": 2.364888888888889e-06, + "loss": 0.3641, + "step": 39680 + }, + { + "epoch": 0.8818432278706008, + "grad_norm": 2.796875, + "learning_rate": 2.3604444444444447e-06, + "loss": 0.3966, + "step": 39690 + }, + { + "epoch": 0.8820654105936723, + "grad_norm": 2.625, + "learning_rate": 2.3560000000000004e-06, + "loss": 0.3988, + "step": 39700 + }, + { + "epoch": 0.8822875933167437, + "grad_norm": 3.328125, + "learning_rate": 2.3515555555555557e-06, + "loss": 0.3729, + "step": 39710 + }, + { + "epoch": 0.8825097760398152, + "grad_norm": 2.53125, + "learning_rate": 2.3471111111111114e-06, + "loss": 0.3538, + "step": 39720 + }, + { + "epoch": 0.8827319587628866, + "grad_norm": 2.4375, + "learning_rate": 2.342666666666667e-06, + "loss": 0.362, + "step": 39730 + }, + { + "epoch": 0.8829541414859581, + "grad_norm": 2.53125, + "learning_rate": 2.3382222222222223e-06, + "loss": 0.4084, + "step": 39740 + }, + { + "epoch": 0.8831763242090295, + "grad_norm": 2.296875, + "learning_rate": 2.333777777777778e-06, + "loss": 0.3524, + "step": 39750 + }, + { + "epoch": 0.883398506932101, + "grad_norm": 2.34375, + "learning_rate": 2.3293333333333333e-06, + "loss": 0.3715, + "step": 39760 + }, + { + "epoch": 0.8836206896551724, + "grad_norm": 2.78125, + "learning_rate": 2.324888888888889e-06, + "loss": 0.3663, + "step": 39770 + }, + { + "epoch": 0.8838428723782439, + "grad_norm": 2.34375, + "learning_rate": 2.3204444444444447e-06, + "loss": 0.3583, + "step": 39780 + }, + { + "epoch": 0.8840650551013153, + "grad_norm": 2.515625, + "learning_rate": 2.3160000000000004e-06, + "loss": 0.3297, + "step": 39790 + }, + { + "epoch": 0.8842872378243868, + "grad_norm": 2.40625, + "learning_rate": 2.3115555555555556e-06, + "loss": 0.3392, + "step": 39800 + }, + { + "epoch": 0.8845094205474582, + "grad_norm": 2.734375, + "learning_rate": 2.3071111111111113e-06, + "loss": 0.3786, + "step": 39810 + }, + { + "epoch": 0.8847316032705297, + "grad_norm": 2.34375, + "learning_rate": 2.302666666666667e-06, + "loss": 0.4139, + "step": 39820 + }, + { + "epoch": 0.8849537859936011, + "grad_norm": 2.484375, + "learning_rate": 2.2982222222222227e-06, + "loss": 0.4058, + "step": 39830 + }, + { + "epoch": 0.8851759687166726, + "grad_norm": 2.671875, + "learning_rate": 2.293777777777778e-06, + "loss": 0.3488, + "step": 39840 + }, + { + "epoch": 0.8853981514397441, + "grad_norm": 2.609375, + "learning_rate": 2.2893333333333332e-06, + "loss": 0.3846, + "step": 39850 + }, + { + "epoch": 0.8856203341628155, + "grad_norm": 2.296875, + "learning_rate": 2.284888888888889e-06, + "loss": 0.3565, + "step": 39860 + }, + { + "epoch": 0.885842516885887, + "grad_norm": 2.234375, + "learning_rate": 2.2804444444444446e-06, + "loss": 0.3832, + "step": 39870 + }, + { + "epoch": 0.8860646996089584, + "grad_norm": 2.0, + "learning_rate": 2.2760000000000003e-06, + "loss": 0.389, + "step": 39880 + }, + { + "epoch": 0.8862868823320299, + "grad_norm": 2.578125, + "learning_rate": 2.2715555555555556e-06, + "loss": 0.3651, + "step": 39890 + }, + { + "epoch": 0.8865090650551013, + "grad_norm": 2.21875, + "learning_rate": 2.2671111111111113e-06, + "loss": 0.3626, + "step": 39900 + }, + { + "epoch": 0.8867312477781728, + "grad_norm": 2.859375, + "learning_rate": 2.262666666666667e-06, + "loss": 0.4105, + "step": 39910 + }, + { + "epoch": 0.8869534305012442, + "grad_norm": 2.484375, + "learning_rate": 2.2582222222222227e-06, + "loss": 0.4118, + "step": 39920 + }, + { + "epoch": 0.8871756132243157, + "grad_norm": 2.328125, + "learning_rate": 2.253777777777778e-06, + "loss": 0.3644, + "step": 39930 + }, + { + "epoch": 0.8873977959473871, + "grad_norm": 3.015625, + "learning_rate": 2.2493333333333336e-06, + "loss": 0.4027, + "step": 39940 + }, + { + "epoch": 0.8876199786704586, + "grad_norm": 2.5, + "learning_rate": 2.244888888888889e-06, + "loss": 0.3823, + "step": 39950 + }, + { + "epoch": 0.88784216139353, + "grad_norm": 2.71875, + "learning_rate": 2.2404444444444446e-06, + "loss": 0.3794, + "step": 39960 + }, + { + "epoch": 0.8880643441166015, + "grad_norm": 2.25, + "learning_rate": 2.2360000000000003e-06, + "loss": 0.4008, + "step": 39970 + }, + { + "epoch": 0.8882865268396729, + "grad_norm": 2.40625, + "learning_rate": 2.2315555555555555e-06, + "loss": 0.3852, + "step": 39980 + }, + { + "epoch": 0.8885087095627444, + "grad_norm": 2.578125, + "learning_rate": 2.2271111111111112e-06, + "loss": 0.4021, + "step": 39990 + }, + { + "epoch": 0.8887308922858158, + "grad_norm": 2.59375, + "learning_rate": 2.222666666666667e-06, + "loss": 0.3567, + "step": 40000 + }, + { + "epoch": 0.8889530750088873, + "grad_norm": 2.375, + "learning_rate": 2.2182222222222226e-06, + "loss": 0.4155, + "step": 40010 + }, + { + "epoch": 0.8891752577319587, + "grad_norm": 2.59375, + "learning_rate": 2.213777777777778e-06, + "loss": 0.4003, + "step": 40020 + }, + { + "epoch": 0.8893974404550302, + "grad_norm": 2.921875, + "learning_rate": 2.2093333333333336e-06, + "loss": 0.3741, + "step": 40030 + }, + { + "epoch": 0.8896196231781017, + "grad_norm": 2.28125, + "learning_rate": 2.2048888888888893e-06, + "loss": 0.3505, + "step": 40040 + }, + { + "epoch": 0.8898418059011731, + "grad_norm": 2.765625, + "learning_rate": 2.2004444444444445e-06, + "loss": 0.384, + "step": 40050 + }, + { + "epoch": 0.8900639886242446, + "grad_norm": 2.21875, + "learning_rate": 2.1960000000000002e-06, + "loss": 0.3613, + "step": 40060 + }, + { + "epoch": 0.890286171347316, + "grad_norm": 2.375, + "learning_rate": 2.1915555555555555e-06, + "loss": 0.369, + "step": 40070 + }, + { + "epoch": 0.8905083540703875, + "grad_norm": 2.421875, + "learning_rate": 2.187111111111111e-06, + "loss": 0.3752, + "step": 40080 + }, + { + "epoch": 0.8907305367934589, + "grad_norm": 2.8125, + "learning_rate": 2.182666666666667e-06, + "loss": 0.3766, + "step": 40090 + }, + { + "epoch": 0.8909527195165304, + "grad_norm": 2.640625, + "learning_rate": 2.1782222222222226e-06, + "loss": 0.3497, + "step": 40100 + }, + { + "epoch": 0.8911749022396018, + "grad_norm": 2.546875, + "learning_rate": 2.173777777777778e-06, + "loss": 0.3781, + "step": 40110 + }, + { + "epoch": 0.8913970849626733, + "grad_norm": 2.75, + "learning_rate": 2.1693333333333335e-06, + "loss": 0.3893, + "step": 40120 + }, + { + "epoch": 0.8916192676857447, + "grad_norm": 2.46875, + "learning_rate": 2.1648888888888892e-06, + "loss": 0.3513, + "step": 40130 + }, + { + "epoch": 0.8918414504088162, + "grad_norm": 2.796875, + "learning_rate": 2.160444444444445e-06, + "loss": 0.3694, + "step": 40140 + }, + { + "epoch": 0.8920636331318876, + "grad_norm": 2.6875, + "learning_rate": 2.156e-06, + "loss": 0.4148, + "step": 40150 + }, + { + "epoch": 0.8922858158549591, + "grad_norm": 2.515625, + "learning_rate": 2.1515555555555555e-06, + "loss": 0.3905, + "step": 40160 + }, + { + "epoch": 0.8925079985780305, + "grad_norm": 2.515625, + "learning_rate": 2.147111111111111e-06, + "loss": 0.4334, + "step": 40170 + }, + { + "epoch": 0.892730181301102, + "grad_norm": 2.28125, + "learning_rate": 2.142666666666667e-06, + "loss": 0.4006, + "step": 40180 + }, + { + "epoch": 0.8929523640241734, + "grad_norm": 2.484375, + "learning_rate": 2.1382222222222225e-06, + "loss": 0.3621, + "step": 40190 + }, + { + "epoch": 0.893174546747245, + "grad_norm": 2.65625, + "learning_rate": 2.133777777777778e-06, + "loss": 0.4158, + "step": 40200 + }, + { + "epoch": 0.8933967294703163, + "grad_norm": 2.53125, + "learning_rate": 2.1293333333333335e-06, + "loss": 0.3865, + "step": 40210 + }, + { + "epoch": 0.8936189121933878, + "grad_norm": 2.21875, + "learning_rate": 2.124888888888889e-06, + "loss": 0.3357, + "step": 40220 + }, + { + "epoch": 0.8938410949164592, + "grad_norm": 2.53125, + "learning_rate": 2.120444444444445e-06, + "loss": 0.3716, + "step": 40230 + }, + { + "epoch": 0.8940632776395308, + "grad_norm": 2.484375, + "learning_rate": 2.116e-06, + "loss": 0.3947, + "step": 40240 + }, + { + "epoch": 0.8942854603626023, + "grad_norm": 2.578125, + "learning_rate": 2.111555555555556e-06, + "loss": 0.3916, + "step": 40250 + }, + { + "epoch": 0.8945076430856737, + "grad_norm": 2.6875, + "learning_rate": 2.107111111111111e-06, + "loss": 0.3989, + "step": 40260 + }, + { + "epoch": 0.8947298258087452, + "grad_norm": 2.703125, + "learning_rate": 2.102666666666667e-06, + "loss": 0.38, + "step": 40270 + }, + { + "epoch": 0.8949520085318166, + "grad_norm": 2.421875, + "learning_rate": 2.0982222222222225e-06, + "loss": 0.3718, + "step": 40280 + }, + { + "epoch": 0.8951741912548881, + "grad_norm": 2.390625, + "learning_rate": 2.0937777777777778e-06, + "loss": 0.4015, + "step": 40290 + }, + { + "epoch": 0.8953963739779595, + "grad_norm": 2.859375, + "learning_rate": 2.0893333333333335e-06, + "loss": 0.3837, + "step": 40300 + }, + { + "epoch": 0.895618556701031, + "grad_norm": 2.640625, + "learning_rate": 2.084888888888889e-06, + "loss": 0.3657, + "step": 40310 + }, + { + "epoch": 0.8958407394241024, + "grad_norm": 2.59375, + "learning_rate": 2.080444444444445e-06, + "loss": 0.3588, + "step": 40320 + }, + { + "epoch": 0.8960629221471739, + "grad_norm": 2.515625, + "learning_rate": 2.076e-06, + "loss": 0.3911, + "step": 40330 + }, + { + "epoch": 0.8962851048702453, + "grad_norm": 2.59375, + "learning_rate": 2.071555555555556e-06, + "loss": 0.4072, + "step": 40340 + }, + { + "epoch": 0.8965072875933168, + "grad_norm": 2.546875, + "learning_rate": 2.0671111111111115e-06, + "loss": 0.3658, + "step": 40350 + }, + { + "epoch": 0.8967294703163882, + "grad_norm": 2.796875, + "learning_rate": 2.0626666666666668e-06, + "loss": 0.4061, + "step": 40360 + }, + { + "epoch": 0.8969516530394597, + "grad_norm": 2.203125, + "learning_rate": 2.0582222222222225e-06, + "loss": 0.3724, + "step": 40370 + }, + { + "epoch": 0.8971738357625311, + "grad_norm": 2.375, + "learning_rate": 2.0537777777777777e-06, + "loss": 0.3883, + "step": 40380 + }, + { + "epoch": 0.8973960184856026, + "grad_norm": 2.90625, + "learning_rate": 2.0493333333333334e-06, + "loss": 0.3974, + "step": 40390 + }, + { + "epoch": 0.897618201208674, + "grad_norm": 2.8125, + "learning_rate": 2.044888888888889e-06, + "loss": 0.3535, + "step": 40400 + }, + { + "epoch": 0.8978403839317455, + "grad_norm": 2.59375, + "learning_rate": 2.040444444444445e-06, + "loss": 0.3663, + "step": 40410 + }, + { + "epoch": 0.8980625666548169, + "grad_norm": 2.71875, + "learning_rate": 2.036e-06, + "loss": 0.3851, + "step": 40420 + }, + { + "epoch": 0.8982847493778884, + "grad_norm": 2.75, + "learning_rate": 2.0315555555555558e-06, + "loss": 0.3804, + "step": 40430 + }, + { + "epoch": 0.8985069321009598, + "grad_norm": 2.40625, + "learning_rate": 2.0271111111111115e-06, + "loss": 0.3553, + "step": 40440 + }, + { + "epoch": 0.8987291148240313, + "grad_norm": 2.40625, + "learning_rate": 2.022666666666667e-06, + "loss": 0.3897, + "step": 40450 + }, + { + "epoch": 0.8989512975471028, + "grad_norm": 2.5625, + "learning_rate": 2.0182222222222224e-06, + "loss": 0.3835, + "step": 40460 + }, + { + "epoch": 0.8991734802701742, + "grad_norm": 2.96875, + "learning_rate": 2.0137777777777777e-06, + "loss": 0.3883, + "step": 40470 + }, + { + "epoch": 0.8993956629932457, + "grad_norm": 2.390625, + "learning_rate": 2.0093333333333334e-06, + "loss": 0.3752, + "step": 40480 + }, + { + "epoch": 0.8996178457163171, + "grad_norm": 3.0625, + "learning_rate": 2.004888888888889e-06, + "loss": 0.3865, + "step": 40490 + }, + { + "epoch": 0.8998400284393886, + "grad_norm": 2.46875, + "learning_rate": 2.0004444444444448e-06, + "loss": 0.373, + "step": 40500 + }, + { + "epoch": 0.90006221116246, + "grad_norm": 2.65625, + "learning_rate": 1.996e-06, + "loss": 0.3439, + "step": 40510 + }, + { + "epoch": 0.9002843938855315, + "grad_norm": 2.640625, + "learning_rate": 1.9915555555555557e-06, + "loss": 0.3622, + "step": 40520 + }, + { + "epoch": 0.9005065766086029, + "grad_norm": 3.03125, + "learning_rate": 1.9871111111111114e-06, + "loss": 0.3596, + "step": 40530 + }, + { + "epoch": 0.9007287593316744, + "grad_norm": 2.953125, + "learning_rate": 1.982666666666667e-06, + "loss": 0.3888, + "step": 40540 + }, + { + "epoch": 0.9009509420547458, + "grad_norm": 2.453125, + "learning_rate": 1.9782222222222224e-06, + "loss": 0.3853, + "step": 40550 + }, + { + "epoch": 0.9011731247778173, + "grad_norm": 2.421875, + "learning_rate": 1.973777777777778e-06, + "loss": 0.3725, + "step": 40560 + }, + { + "epoch": 0.9013953075008887, + "grad_norm": 2.375, + "learning_rate": 1.9693333333333333e-06, + "loss": 0.3684, + "step": 40570 + }, + { + "epoch": 0.9016174902239602, + "grad_norm": 2.84375, + "learning_rate": 1.964888888888889e-06, + "loss": 0.4225, + "step": 40580 + }, + { + "epoch": 0.9018396729470316, + "grad_norm": 2.703125, + "learning_rate": 1.9604444444444447e-06, + "loss": 0.3596, + "step": 40590 + }, + { + "epoch": 0.9020618556701031, + "grad_norm": 2.421875, + "learning_rate": 1.956e-06, + "loss": 0.3713, + "step": 40600 + }, + { + "epoch": 0.9022840383931745, + "grad_norm": 2.734375, + "learning_rate": 1.9515555555555557e-06, + "loss": 0.3995, + "step": 40610 + }, + { + "epoch": 0.902506221116246, + "grad_norm": 2.09375, + "learning_rate": 1.9471111111111114e-06, + "loss": 0.3526, + "step": 40620 + }, + { + "epoch": 0.9027284038393174, + "grad_norm": 2.5625, + "learning_rate": 1.942666666666667e-06, + "loss": 0.3605, + "step": 40630 + }, + { + "epoch": 0.9029505865623889, + "grad_norm": 2.40625, + "learning_rate": 1.9382222222222223e-06, + "loss": 0.3665, + "step": 40640 + }, + { + "epoch": 0.9031727692854603, + "grad_norm": 2.453125, + "learning_rate": 1.933777777777778e-06, + "loss": 0.3558, + "step": 40650 + }, + { + "epoch": 0.9033949520085318, + "grad_norm": 2.234375, + "learning_rate": 1.9293333333333337e-06, + "loss": 0.3958, + "step": 40660 + }, + { + "epoch": 0.9036171347316033, + "grad_norm": 2.921875, + "learning_rate": 1.924888888888889e-06, + "loss": 0.38, + "step": 40670 + }, + { + "epoch": 0.9038393174546747, + "grad_norm": 2.59375, + "learning_rate": 1.9204444444444447e-06, + "loss": 0.3761, + "step": 40680 + }, + { + "epoch": 0.9040615001777462, + "grad_norm": 2.234375, + "learning_rate": 1.916e-06, + "loss": 0.3434, + "step": 40690 + }, + { + "epoch": 0.9042836829008176, + "grad_norm": 2.640625, + "learning_rate": 1.9115555555555556e-06, + "loss": 0.4032, + "step": 40700 + }, + { + "epoch": 0.9045058656238891, + "grad_norm": 2.265625, + "learning_rate": 1.9071111111111113e-06, + "loss": 0.4014, + "step": 40710 + }, + { + "epoch": 0.9047280483469605, + "grad_norm": 2.765625, + "learning_rate": 1.9026666666666668e-06, + "loss": 0.3608, + "step": 40720 + }, + { + "epoch": 0.904950231070032, + "grad_norm": 2.453125, + "learning_rate": 1.8982222222222225e-06, + "loss": 0.3829, + "step": 40730 + }, + { + "epoch": 0.9051724137931034, + "grad_norm": 2.6875, + "learning_rate": 1.893777777777778e-06, + "loss": 0.3839, + "step": 40740 + }, + { + "epoch": 0.9053945965161749, + "grad_norm": 2.625, + "learning_rate": 1.8893333333333335e-06, + "loss": 0.4213, + "step": 40750 + }, + { + "epoch": 0.9056167792392463, + "grad_norm": 2.609375, + "learning_rate": 1.8848888888888892e-06, + "loss": 0.411, + "step": 40760 + }, + { + "epoch": 0.9058389619623178, + "grad_norm": 2.890625, + "learning_rate": 1.8804444444444444e-06, + "loss": 0.3776, + "step": 40770 + }, + { + "epoch": 0.9060611446853892, + "grad_norm": 2.78125, + "learning_rate": 1.8760000000000001e-06, + "loss": 0.3827, + "step": 40780 + }, + { + "epoch": 0.9062833274084607, + "grad_norm": 2.515625, + "learning_rate": 1.8715555555555556e-06, + "loss": 0.329, + "step": 40790 + }, + { + "epoch": 0.9065055101315321, + "grad_norm": 2.40625, + "learning_rate": 1.8671111111111113e-06, + "loss": 0.3922, + "step": 40800 + }, + { + "epoch": 0.9067276928546036, + "grad_norm": 2.40625, + "learning_rate": 1.8626666666666668e-06, + "loss": 0.3798, + "step": 40810 + }, + { + "epoch": 0.906949875577675, + "grad_norm": 2.40625, + "learning_rate": 1.8582222222222225e-06, + "loss": 0.3537, + "step": 40820 + }, + { + "epoch": 0.9071720583007465, + "grad_norm": 2.453125, + "learning_rate": 1.853777777777778e-06, + "loss": 0.3708, + "step": 40830 + }, + { + "epoch": 0.9073942410238179, + "grad_norm": 2.578125, + "learning_rate": 1.8493333333333336e-06, + "loss": 0.3599, + "step": 40840 + }, + { + "epoch": 0.9076164237468894, + "grad_norm": 2.390625, + "learning_rate": 1.8448888888888891e-06, + "loss": 0.3792, + "step": 40850 + }, + { + "epoch": 0.9078386064699608, + "grad_norm": 2.515625, + "learning_rate": 1.8404444444444446e-06, + "loss": 0.4045, + "step": 40860 + }, + { + "epoch": 0.9080607891930323, + "grad_norm": 2.203125, + "learning_rate": 1.8360000000000003e-06, + "loss": 0.3799, + "step": 40870 + }, + { + "epoch": 0.9082829719161039, + "grad_norm": 3.484375, + "learning_rate": 1.8315555555555556e-06, + "loss": 0.3598, + "step": 40880 + }, + { + "epoch": 0.9085051546391752, + "grad_norm": 2.65625, + "learning_rate": 1.8271111111111113e-06, + "loss": 0.3637, + "step": 40890 + }, + { + "epoch": 0.9087273373622468, + "grad_norm": 2.484375, + "learning_rate": 1.8226666666666667e-06, + "loss": 0.3867, + "step": 40900 + }, + { + "epoch": 0.9089495200853182, + "grad_norm": 2.15625, + "learning_rate": 1.8182222222222224e-06, + "loss": 0.3726, + "step": 40910 + }, + { + "epoch": 0.9091717028083897, + "grad_norm": 2.703125, + "learning_rate": 1.813777777777778e-06, + "loss": 0.3596, + "step": 40920 + }, + { + "epoch": 0.909393885531461, + "grad_norm": 2.421875, + "learning_rate": 1.8093333333333336e-06, + "loss": 0.368, + "step": 40930 + }, + { + "epoch": 0.9096160682545326, + "grad_norm": 2.59375, + "learning_rate": 1.804888888888889e-06, + "loss": 0.3872, + "step": 40940 + }, + { + "epoch": 0.909838250977604, + "grad_norm": 2.984375, + "learning_rate": 1.8004444444444446e-06, + "loss": 0.4331, + "step": 40950 + }, + { + "epoch": 0.9100604337006755, + "grad_norm": 2.296875, + "learning_rate": 1.7960000000000003e-06, + "loss": 0.396, + "step": 40960 + }, + { + "epoch": 0.9102826164237469, + "grad_norm": 2.640625, + "learning_rate": 1.7915555555555557e-06, + "loss": 0.4018, + "step": 40970 + }, + { + "epoch": 0.9105047991468184, + "grad_norm": 3.0, + "learning_rate": 1.7871111111111112e-06, + "loss": 0.3579, + "step": 40980 + }, + { + "epoch": 0.9107269818698898, + "grad_norm": 2.640625, + "learning_rate": 1.7826666666666667e-06, + "loss": 0.3482, + "step": 40990 + }, + { + "epoch": 0.9109491645929613, + "grad_norm": 2.625, + "learning_rate": 1.7782222222222224e-06, + "loss": 0.3721, + "step": 41000 + }, + { + "epoch": 0.9111713473160327, + "grad_norm": 2.390625, + "learning_rate": 1.7737777777777779e-06, + "loss": 0.3953, + "step": 41010 + }, + { + "epoch": 0.9113935300391042, + "grad_norm": 3.078125, + "learning_rate": 1.7693333333333336e-06, + "loss": 0.3973, + "step": 41020 + }, + { + "epoch": 0.9116157127621756, + "grad_norm": 2.203125, + "learning_rate": 1.764888888888889e-06, + "loss": 0.4075, + "step": 41030 + }, + { + "epoch": 0.9118378954852471, + "grad_norm": 2.4375, + "learning_rate": 1.7604444444444445e-06, + "loss": 0.3856, + "step": 41040 + }, + { + "epoch": 0.9120600782083185, + "grad_norm": 2.75, + "learning_rate": 1.7560000000000002e-06, + "loss": 0.4153, + "step": 41050 + }, + { + "epoch": 0.91228226093139, + "grad_norm": 2.3125, + "learning_rate": 1.7515555555555557e-06, + "loss": 0.3414, + "step": 41060 + }, + { + "epoch": 0.9125044436544615, + "grad_norm": 2.453125, + "learning_rate": 1.7471111111111114e-06, + "loss": 0.3496, + "step": 41070 + }, + { + "epoch": 0.9127266263775329, + "grad_norm": 2.328125, + "learning_rate": 1.7426666666666667e-06, + "loss": 0.3977, + "step": 41080 + }, + { + "epoch": 0.9129488091006044, + "grad_norm": 2.28125, + "learning_rate": 1.7382222222222223e-06, + "loss": 0.3732, + "step": 41090 + }, + { + "epoch": 0.9131709918236758, + "grad_norm": 2.5, + "learning_rate": 1.7337777777777778e-06, + "loss": 0.4004, + "step": 41100 + }, + { + "epoch": 0.9133931745467473, + "grad_norm": 2.65625, + "learning_rate": 1.7293333333333335e-06, + "loss": 0.3833, + "step": 41110 + }, + { + "epoch": 0.9136153572698187, + "grad_norm": 1.9921875, + "learning_rate": 1.724888888888889e-06, + "loss": 0.3491, + "step": 41120 + }, + { + "epoch": 0.9138375399928902, + "grad_norm": 2.40625, + "learning_rate": 1.7204444444444445e-06, + "loss": 0.3881, + "step": 41130 + }, + { + "epoch": 0.9140597227159616, + "grad_norm": 2.8125, + "learning_rate": 1.7160000000000002e-06, + "loss": 0.3619, + "step": 41140 + }, + { + "epoch": 0.9142819054390331, + "grad_norm": 2.5, + "learning_rate": 1.7115555555555557e-06, + "loss": 0.3969, + "step": 41150 + }, + { + "epoch": 0.9145040881621045, + "grad_norm": 2.46875, + "learning_rate": 1.7071111111111113e-06, + "loss": 0.3802, + "step": 41160 + }, + { + "epoch": 0.914726270885176, + "grad_norm": 2.765625, + "learning_rate": 1.7026666666666668e-06, + "loss": 0.4202, + "step": 41170 + }, + { + "epoch": 0.9149484536082474, + "grad_norm": 2.34375, + "learning_rate": 1.6982222222222225e-06, + "loss": 0.354, + "step": 41180 + }, + { + "epoch": 0.9151706363313189, + "grad_norm": 2.546875, + "learning_rate": 1.6937777777777778e-06, + "loss": 0.371, + "step": 41190 + }, + { + "epoch": 0.9153928190543903, + "grad_norm": 2.703125, + "learning_rate": 1.6893333333333335e-06, + "loss": 0.3855, + "step": 41200 + }, + { + "epoch": 0.9156150017774618, + "grad_norm": 2.625, + "learning_rate": 1.684888888888889e-06, + "loss": 0.4043, + "step": 41210 + }, + { + "epoch": 0.9158371845005332, + "grad_norm": 2.875, + "learning_rate": 1.6804444444444444e-06, + "loss": 0.3803, + "step": 41220 + }, + { + "epoch": 0.9160593672236047, + "grad_norm": 2.296875, + "learning_rate": 1.6760000000000001e-06, + "loss": 0.4139, + "step": 41230 + }, + { + "epoch": 0.9162815499466761, + "grad_norm": 2.328125, + "learning_rate": 1.6715555555555556e-06, + "loss": 0.3653, + "step": 41240 + }, + { + "epoch": 0.9165037326697476, + "grad_norm": 2.84375, + "learning_rate": 1.6671111111111113e-06, + "loss": 0.3962, + "step": 41250 + }, + { + "epoch": 0.916725915392819, + "grad_norm": 2.421875, + "learning_rate": 1.6626666666666668e-06, + "loss": 0.3863, + "step": 41260 + }, + { + "epoch": 0.9169480981158905, + "grad_norm": 2.75, + "learning_rate": 1.6582222222222225e-06, + "loss": 0.3965, + "step": 41270 + }, + { + "epoch": 0.917170280838962, + "grad_norm": 2.5625, + "learning_rate": 1.653777777777778e-06, + "loss": 0.3669, + "step": 41280 + }, + { + "epoch": 0.9173924635620334, + "grad_norm": 2.390625, + "learning_rate": 1.6493333333333334e-06, + "loss": 0.4242, + "step": 41290 + }, + { + "epoch": 0.9176146462851049, + "grad_norm": 2.296875, + "learning_rate": 1.644888888888889e-06, + "loss": 0.3731, + "step": 41300 + }, + { + "epoch": 0.9178368290081763, + "grad_norm": 2.5, + "learning_rate": 1.6404444444444446e-06, + "loss": 0.3651, + "step": 41310 + }, + { + "epoch": 0.9180590117312478, + "grad_norm": 2.421875, + "learning_rate": 1.636e-06, + "loss": 0.3503, + "step": 41320 + }, + { + "epoch": 0.9182811944543192, + "grad_norm": 2.796875, + "learning_rate": 1.6315555555555556e-06, + "loss": 0.3702, + "step": 41330 + }, + { + "epoch": 0.9185033771773907, + "grad_norm": 2.71875, + "learning_rate": 1.6271111111111113e-06, + "loss": 0.3517, + "step": 41340 + }, + { + "epoch": 0.9187255599004621, + "grad_norm": 2.578125, + "learning_rate": 1.6226666666666667e-06, + "loss": 0.3837, + "step": 41350 + }, + { + "epoch": 0.9189477426235336, + "grad_norm": 2.453125, + "learning_rate": 1.6182222222222224e-06, + "loss": 0.3561, + "step": 41360 + }, + { + "epoch": 0.919169925346605, + "grad_norm": 2.359375, + "learning_rate": 1.613777777777778e-06, + "loss": 0.3496, + "step": 41370 + }, + { + "epoch": 0.9193921080696765, + "grad_norm": 2.78125, + "learning_rate": 1.6093333333333336e-06, + "loss": 0.4501, + "step": 41380 + }, + { + "epoch": 0.9196142907927479, + "grad_norm": 2.640625, + "learning_rate": 1.6048888888888889e-06, + "loss": 0.3776, + "step": 41390 + }, + { + "epoch": 0.9198364735158194, + "grad_norm": 2.859375, + "learning_rate": 1.6004444444444446e-06, + "loss": 0.3998, + "step": 41400 + }, + { + "epoch": 0.9200586562388908, + "grad_norm": 2.859375, + "learning_rate": 1.596e-06, + "loss": 0.3667, + "step": 41410 + }, + { + "epoch": 0.9202808389619623, + "grad_norm": 2.5, + "learning_rate": 1.5915555555555555e-06, + "loss": 0.4079, + "step": 41420 + }, + { + "epoch": 0.9205030216850337, + "grad_norm": 2.5625, + "learning_rate": 1.5871111111111112e-06, + "loss": 0.3854, + "step": 41430 + }, + { + "epoch": 0.9207252044081052, + "grad_norm": 2.4375, + "learning_rate": 1.5826666666666667e-06, + "loss": 0.383, + "step": 41440 + }, + { + "epoch": 0.9209473871311766, + "grad_norm": 2.8125, + "learning_rate": 1.5782222222222224e-06, + "loss": 0.3772, + "step": 41450 + }, + { + "epoch": 0.9211695698542481, + "grad_norm": 2.515625, + "learning_rate": 1.5737777777777779e-06, + "loss": 0.3157, + "step": 41460 + }, + { + "epoch": 0.9213917525773195, + "grad_norm": 2.578125, + "learning_rate": 1.5693333333333336e-06, + "loss": 0.3859, + "step": 41470 + }, + { + "epoch": 0.921613935300391, + "grad_norm": 2.953125, + "learning_rate": 1.564888888888889e-06, + "loss": 0.3857, + "step": 41480 + }, + { + "epoch": 0.9218361180234625, + "grad_norm": 2.203125, + "learning_rate": 1.5604444444444447e-06, + "loss": 0.3758, + "step": 41490 + }, + { + "epoch": 0.9220583007465339, + "grad_norm": 2.328125, + "learning_rate": 1.556e-06, + "loss": 0.361, + "step": 41500 + }, + { + "epoch": 0.9222804834696054, + "grad_norm": 2.546875, + "learning_rate": 1.5515555555555555e-06, + "loss": 0.4005, + "step": 41510 + }, + { + "epoch": 0.9225026661926768, + "grad_norm": 3.265625, + "learning_rate": 1.5471111111111112e-06, + "loss": 0.3538, + "step": 41520 + }, + { + "epoch": 0.9227248489157484, + "grad_norm": 2.8125, + "learning_rate": 1.5426666666666667e-06, + "loss": 0.4144, + "step": 41530 + }, + { + "epoch": 0.9229470316388197, + "grad_norm": 2.390625, + "learning_rate": 1.5382222222222224e-06, + "loss": 0.3523, + "step": 41540 + }, + { + "epoch": 0.9231692143618913, + "grad_norm": 2.265625, + "learning_rate": 1.5337777777777778e-06, + "loss": 0.3618, + "step": 41550 + }, + { + "epoch": 0.9233913970849627, + "grad_norm": 2.53125, + "learning_rate": 1.5293333333333335e-06, + "loss": 0.3997, + "step": 41560 + }, + { + "epoch": 0.9236135798080342, + "grad_norm": 2.6875, + "learning_rate": 1.524888888888889e-06, + "loss": 0.366, + "step": 41570 + }, + { + "epoch": 0.9238357625311056, + "grad_norm": 2.625, + "learning_rate": 1.5204444444444447e-06, + "loss": 0.4361, + "step": 41580 + }, + { + "epoch": 0.9240579452541771, + "grad_norm": 2.59375, + "learning_rate": 1.5160000000000002e-06, + "loss": 0.392, + "step": 41590 + }, + { + "epoch": 0.9242801279772485, + "grad_norm": 2.390625, + "learning_rate": 1.5115555555555554e-06, + "loss": 0.4082, + "step": 41600 + }, + { + "epoch": 0.92450231070032, + "grad_norm": 2.34375, + "learning_rate": 1.5071111111111111e-06, + "loss": 0.4094, + "step": 41610 + }, + { + "epoch": 0.9247244934233914, + "grad_norm": 2.359375, + "learning_rate": 1.5026666666666666e-06, + "loss": 0.3583, + "step": 41620 + }, + { + "epoch": 0.9249466761464629, + "grad_norm": 2.875, + "learning_rate": 1.4982222222222223e-06, + "loss": 0.3857, + "step": 41630 + }, + { + "epoch": 0.9251688588695343, + "grad_norm": 2.59375, + "learning_rate": 1.4937777777777778e-06, + "loss": 0.3817, + "step": 41640 + }, + { + "epoch": 0.9253910415926058, + "grad_norm": 2.21875, + "learning_rate": 1.4893333333333335e-06, + "loss": 0.3696, + "step": 41650 + }, + { + "epoch": 0.9256132243156772, + "grad_norm": 3.0, + "learning_rate": 1.484888888888889e-06, + "loss": 0.3652, + "step": 41660 + }, + { + "epoch": 0.9258354070387487, + "grad_norm": 2.859375, + "learning_rate": 1.4804444444444447e-06, + "loss": 0.4169, + "step": 41670 + }, + { + "epoch": 0.9260575897618201, + "grad_norm": 2.6875, + "learning_rate": 1.4760000000000001e-06, + "loss": 0.369, + "step": 41680 + }, + { + "epoch": 0.9262797724848916, + "grad_norm": 2.46875, + "learning_rate": 1.4715555555555558e-06, + "loss": 0.3739, + "step": 41690 + }, + { + "epoch": 0.9265019552079631, + "grad_norm": 2.8125, + "learning_rate": 1.467111111111111e-06, + "loss": 0.3888, + "step": 41700 + }, + { + "epoch": 0.9267241379310345, + "grad_norm": 2.515625, + "learning_rate": 1.4626666666666666e-06, + "loss": 0.3685, + "step": 41710 + }, + { + "epoch": 0.926946320654106, + "grad_norm": 2.484375, + "learning_rate": 1.4582222222222223e-06, + "loss": 0.4017, + "step": 41720 + }, + { + "epoch": 0.9271685033771774, + "grad_norm": 2.59375, + "learning_rate": 1.4537777777777778e-06, + "loss": 0.358, + "step": 41730 + }, + { + "epoch": 0.9273906861002489, + "grad_norm": 2.15625, + "learning_rate": 1.4493333333333334e-06, + "loss": 0.3853, + "step": 41740 + }, + { + "epoch": 0.9276128688233203, + "grad_norm": 2.359375, + "learning_rate": 1.444888888888889e-06, + "loss": 0.3745, + "step": 41750 + }, + { + "epoch": 0.9278350515463918, + "grad_norm": 1.90625, + "learning_rate": 1.4404444444444446e-06, + "loss": 0.3652, + "step": 41760 + }, + { + "epoch": 0.9280572342694632, + "grad_norm": 2.5625, + "learning_rate": 1.436e-06, + "loss": 0.3897, + "step": 41770 + }, + { + "epoch": 0.9282794169925347, + "grad_norm": 2.40625, + "learning_rate": 1.4315555555555558e-06, + "loss": 0.3989, + "step": 41780 + }, + { + "epoch": 0.9285015997156061, + "grad_norm": 2.421875, + "learning_rate": 1.4271111111111113e-06, + "loss": 0.3566, + "step": 41790 + }, + { + "epoch": 0.9287237824386776, + "grad_norm": 2.328125, + "learning_rate": 1.422666666666667e-06, + "loss": 0.4067, + "step": 41800 + }, + { + "epoch": 0.928945965161749, + "grad_norm": 2.59375, + "learning_rate": 1.4182222222222222e-06, + "loss": 0.3799, + "step": 41810 + }, + { + "epoch": 0.9291681478848205, + "grad_norm": 2.546875, + "learning_rate": 1.4137777777777777e-06, + "loss": 0.3472, + "step": 41820 + }, + { + "epoch": 0.9293903306078919, + "grad_norm": 2.46875, + "learning_rate": 1.4093333333333334e-06, + "loss": 0.3585, + "step": 41830 + }, + { + "epoch": 0.9296125133309634, + "grad_norm": 2.65625, + "learning_rate": 1.4048888888888889e-06, + "loss": 0.3858, + "step": 41840 + }, + { + "epoch": 0.9298346960540348, + "grad_norm": 2.703125, + "learning_rate": 1.4004444444444446e-06, + "loss": 0.3849, + "step": 41850 + }, + { + "epoch": 0.9300568787771063, + "grad_norm": 2.390625, + "learning_rate": 1.396e-06, + "loss": 0.3503, + "step": 41860 + }, + { + "epoch": 0.9302790615001777, + "grad_norm": 2.34375, + "learning_rate": 1.3915555555555558e-06, + "loss": 0.3858, + "step": 41870 + }, + { + "epoch": 0.9305012442232492, + "grad_norm": 2.4375, + "learning_rate": 1.3871111111111112e-06, + "loss": 0.3878, + "step": 41880 + }, + { + "epoch": 0.9307234269463207, + "grad_norm": 2.6875, + "learning_rate": 1.382666666666667e-06, + "loss": 0.3741, + "step": 41890 + }, + { + "epoch": 0.9309456096693921, + "grad_norm": 2.140625, + "learning_rate": 1.3782222222222224e-06, + "loss": 0.3998, + "step": 41900 + }, + { + "epoch": 0.9311677923924636, + "grad_norm": 2.765625, + "learning_rate": 1.3737777777777777e-06, + "loss": 0.3897, + "step": 41910 + }, + { + "epoch": 0.931389975115535, + "grad_norm": 2.59375, + "learning_rate": 1.3693333333333334e-06, + "loss": 0.3934, + "step": 41920 + }, + { + "epoch": 0.9316121578386065, + "grad_norm": 1.9765625, + "learning_rate": 1.3648888888888888e-06, + "loss": 0.3852, + "step": 41930 + }, + { + "epoch": 0.9318343405616779, + "grad_norm": 2.828125, + "learning_rate": 1.3604444444444445e-06, + "loss": 0.4274, + "step": 41940 + }, + { + "epoch": 0.9320565232847494, + "grad_norm": 2.546875, + "learning_rate": 1.356e-06, + "loss": 0.3154, + "step": 41950 + }, + { + "epoch": 0.9322787060078208, + "grad_norm": 2.5625, + "learning_rate": 1.3515555555555557e-06, + "loss": 0.4125, + "step": 41960 + }, + { + "epoch": 0.9325008887308923, + "grad_norm": 2.640625, + "learning_rate": 1.3471111111111112e-06, + "loss": 0.3975, + "step": 41970 + }, + { + "epoch": 0.9327230714539637, + "grad_norm": 2.75, + "learning_rate": 1.3426666666666669e-06, + "loss": 0.3655, + "step": 41980 + }, + { + "epoch": 0.9329452541770352, + "grad_norm": 2.625, + "learning_rate": 1.3382222222222224e-06, + "loss": 0.4096, + "step": 41990 + }, + { + "epoch": 0.9331674369001066, + "grad_norm": 2.5625, + "learning_rate": 1.333777777777778e-06, + "loss": 0.3845, + "step": 42000 + }, + { + "epoch": 0.9333896196231781, + "grad_norm": 2.34375, + "learning_rate": 1.3293333333333333e-06, + "loss": 0.3792, + "step": 42010 + }, + { + "epoch": 0.9336118023462495, + "grad_norm": 2.578125, + "learning_rate": 1.3248888888888888e-06, + "loss": 0.3942, + "step": 42020 + }, + { + "epoch": 0.933833985069321, + "grad_norm": 2.859375, + "learning_rate": 1.3204444444444445e-06, + "loss": 0.39, + "step": 42030 + }, + { + "epoch": 0.9340561677923924, + "grad_norm": 2.890625, + "learning_rate": 1.316e-06, + "loss": 0.3978, + "step": 42040 + }, + { + "epoch": 0.9342783505154639, + "grad_norm": 2.5, + "learning_rate": 1.3115555555555557e-06, + "loss": 0.3792, + "step": 42050 + }, + { + "epoch": 0.9345005332385353, + "grad_norm": 2.734375, + "learning_rate": 1.3071111111111112e-06, + "loss": 0.3787, + "step": 42060 + }, + { + "epoch": 0.9347227159616068, + "grad_norm": 2.5625, + "learning_rate": 1.3026666666666668e-06, + "loss": 0.3818, + "step": 42070 + }, + { + "epoch": 0.9349448986846782, + "grad_norm": 2.703125, + "learning_rate": 1.2982222222222223e-06, + "loss": 0.3714, + "step": 42080 + }, + { + "epoch": 0.9351670814077497, + "grad_norm": 2.5, + "learning_rate": 1.293777777777778e-06, + "loss": 0.3789, + "step": 42090 + }, + { + "epoch": 0.9353892641308212, + "grad_norm": 3.234375, + "learning_rate": 1.2893333333333335e-06, + "loss": 0.358, + "step": 42100 + }, + { + "epoch": 0.9356114468538926, + "grad_norm": 2.390625, + "learning_rate": 1.2848888888888892e-06, + "loss": 0.3733, + "step": 42110 + }, + { + "epoch": 0.9358336295769641, + "grad_norm": 2.3125, + "learning_rate": 1.2804444444444445e-06, + "loss": 0.3934, + "step": 42120 + }, + { + "epoch": 0.9360558123000355, + "grad_norm": 2.59375, + "learning_rate": 1.276e-06, + "loss": 0.3869, + "step": 42130 + }, + { + "epoch": 0.936277995023107, + "grad_norm": 2.234375, + "learning_rate": 1.2715555555555556e-06, + "loss": 0.4351, + "step": 42140 + }, + { + "epoch": 0.9365001777461784, + "grad_norm": 2.25, + "learning_rate": 1.2671111111111111e-06, + "loss": 0.3776, + "step": 42150 + }, + { + "epoch": 0.93672236046925, + "grad_norm": 2.34375, + "learning_rate": 1.2626666666666668e-06, + "loss": 0.3752, + "step": 42160 + }, + { + "epoch": 0.9369445431923213, + "grad_norm": 2.796875, + "learning_rate": 1.2582222222222223e-06, + "loss": 0.3555, + "step": 42170 + }, + { + "epoch": 0.9371667259153929, + "grad_norm": 2.328125, + "learning_rate": 1.253777777777778e-06, + "loss": 0.394, + "step": 42180 + }, + { + "epoch": 0.9373889086384642, + "grad_norm": 2.96875, + "learning_rate": 1.2493333333333335e-06, + "loss": 0.4158, + "step": 42190 + }, + { + "epoch": 0.9376110913615358, + "grad_norm": 2.15625, + "learning_rate": 1.244888888888889e-06, + "loss": 0.3608, + "step": 42200 + }, + { + "epoch": 0.9378332740846071, + "grad_norm": 2.296875, + "learning_rate": 1.2404444444444446e-06, + "loss": 0.3904, + "step": 42210 + }, + { + "epoch": 0.9380554568076787, + "grad_norm": 2.40625, + "learning_rate": 1.2360000000000001e-06, + "loss": 0.4005, + "step": 42220 + }, + { + "epoch": 0.93827763953075, + "grad_norm": 3.078125, + "learning_rate": 1.2315555555555558e-06, + "loss": 0.3674, + "step": 42230 + }, + { + "epoch": 0.9384998222538216, + "grad_norm": 3.015625, + "learning_rate": 1.2271111111111113e-06, + "loss": 0.3818, + "step": 42240 + }, + { + "epoch": 0.938722004976893, + "grad_norm": 2.6875, + "learning_rate": 1.2226666666666668e-06, + "loss": 0.3794, + "step": 42250 + }, + { + "epoch": 0.9389441876999645, + "grad_norm": 2.734375, + "learning_rate": 1.2182222222222222e-06, + "loss": 0.411, + "step": 42260 + }, + { + "epoch": 0.9391663704230359, + "grad_norm": 2.53125, + "learning_rate": 1.213777777777778e-06, + "loss": 0.3834, + "step": 42270 + }, + { + "epoch": 0.9393885531461074, + "grad_norm": 2.484375, + "learning_rate": 1.2093333333333334e-06, + "loss": 0.3438, + "step": 42280 + }, + { + "epoch": 0.9396107358691788, + "grad_norm": 2.25, + "learning_rate": 1.2048888888888891e-06, + "loss": 0.4101, + "step": 42290 + }, + { + "epoch": 0.9398329185922503, + "grad_norm": 2.53125, + "learning_rate": 1.2004444444444446e-06, + "loss": 0.4113, + "step": 42300 + }, + { + "epoch": 0.9400551013153218, + "grad_norm": 2.0625, + "learning_rate": 1.196e-06, + "loss": 0.3796, + "step": 42310 + }, + { + "epoch": 0.9402772840383932, + "grad_norm": 2.625, + "learning_rate": 1.1915555555555558e-06, + "loss": 0.3958, + "step": 42320 + }, + { + "epoch": 0.9404994667614647, + "grad_norm": 2.359375, + "learning_rate": 1.1871111111111112e-06, + "loss": 0.4032, + "step": 42330 + }, + { + "epoch": 0.9407216494845361, + "grad_norm": 2.671875, + "learning_rate": 1.1826666666666667e-06, + "loss": 0.3915, + "step": 42340 + }, + { + "epoch": 0.9409438322076076, + "grad_norm": 2.453125, + "learning_rate": 1.1782222222222222e-06, + "loss": 0.4121, + "step": 42350 + }, + { + "epoch": 0.941166014930679, + "grad_norm": 2.53125, + "learning_rate": 1.173777777777778e-06, + "loss": 0.3798, + "step": 42360 + }, + { + "epoch": 0.9413881976537505, + "grad_norm": 2.453125, + "learning_rate": 1.1693333333333334e-06, + "loss": 0.3774, + "step": 42370 + }, + { + "epoch": 0.9416103803768219, + "grad_norm": 2.515625, + "learning_rate": 1.164888888888889e-06, + "loss": 0.3736, + "step": 42380 + }, + { + "epoch": 0.9418325630998934, + "grad_norm": 2.625, + "learning_rate": 1.1604444444444445e-06, + "loss": 0.3654, + "step": 42390 + }, + { + "epoch": 0.9420547458229648, + "grad_norm": 2.515625, + "learning_rate": 1.156e-06, + "loss": 0.3836, + "step": 42400 + }, + { + "epoch": 0.9422769285460363, + "grad_norm": 2.5625, + "learning_rate": 1.1515555555555557e-06, + "loss": 0.3899, + "step": 42410 + }, + { + "epoch": 0.9424991112691077, + "grad_norm": 2.421875, + "learning_rate": 1.1471111111111112e-06, + "loss": 0.3751, + "step": 42420 + }, + { + "epoch": 0.9427212939921792, + "grad_norm": 2.546875, + "learning_rate": 1.1426666666666667e-06, + "loss": 0.3956, + "step": 42430 + }, + { + "epoch": 0.9429434767152506, + "grad_norm": 2.71875, + "learning_rate": 1.1382222222222224e-06, + "loss": 0.3709, + "step": 42440 + }, + { + "epoch": 0.9431656594383221, + "grad_norm": 2.46875, + "learning_rate": 1.1337777777777779e-06, + "loss": 0.3564, + "step": 42450 + }, + { + "epoch": 0.9433878421613935, + "grad_norm": 2.4375, + "learning_rate": 1.1293333333333333e-06, + "loss": 0.3594, + "step": 42460 + }, + { + "epoch": 0.943610024884465, + "grad_norm": 2.546875, + "learning_rate": 1.124888888888889e-06, + "loss": 0.3577, + "step": 42470 + }, + { + "epoch": 0.9438322076075364, + "grad_norm": 1.921875, + "learning_rate": 1.1204444444444445e-06, + "loss": 0.3817, + "step": 42480 + }, + { + "epoch": 0.9440543903306079, + "grad_norm": 2.640625, + "learning_rate": 1.1160000000000002e-06, + "loss": 0.3739, + "step": 42490 + }, + { + "epoch": 0.9442765730536793, + "grad_norm": 3.0625, + "learning_rate": 1.1115555555555557e-06, + "loss": 0.376, + "step": 42500 + }, + { + "epoch": 0.9444987557767508, + "grad_norm": 2.5625, + "learning_rate": 1.1071111111111112e-06, + "loss": 0.4159, + "step": 42510 + }, + { + "epoch": 0.9447209384998223, + "grad_norm": 2.546875, + "learning_rate": 1.1026666666666666e-06, + "loss": 0.3683, + "step": 42520 + }, + { + "epoch": 0.9449431212228937, + "grad_norm": 2.421875, + "learning_rate": 1.0982222222222223e-06, + "loss": 0.3693, + "step": 42530 + }, + { + "epoch": 0.9451653039459652, + "grad_norm": 2.578125, + "learning_rate": 1.0937777777777778e-06, + "loss": 0.4035, + "step": 42540 + }, + { + "epoch": 0.9453874866690366, + "grad_norm": 2.515625, + "learning_rate": 1.0893333333333333e-06, + "loss": 0.3564, + "step": 42550 + }, + { + "epoch": 0.9456096693921081, + "grad_norm": 2.78125, + "learning_rate": 1.084888888888889e-06, + "loss": 0.3892, + "step": 42560 + }, + { + "epoch": 0.9458318521151795, + "grad_norm": 2.765625, + "learning_rate": 1.0804444444444445e-06, + "loss": 0.4029, + "step": 42570 + }, + { + "epoch": 0.946054034838251, + "grad_norm": 2.71875, + "learning_rate": 1.0760000000000002e-06, + "loss": 0.3823, + "step": 42580 + }, + { + "epoch": 0.9462762175613224, + "grad_norm": 2.1875, + "learning_rate": 1.0715555555555556e-06, + "loss": 0.3705, + "step": 42590 + }, + { + "epoch": 0.9464984002843939, + "grad_norm": 2.765625, + "learning_rate": 1.0671111111111113e-06, + "loss": 0.3629, + "step": 42600 + }, + { + "epoch": 0.9467205830074653, + "grad_norm": 2.46875, + "learning_rate": 1.0626666666666668e-06, + "loss": 0.3675, + "step": 42610 + }, + { + "epoch": 0.9469427657305368, + "grad_norm": 2.46875, + "learning_rate": 1.0582222222222223e-06, + "loss": 0.39, + "step": 42620 + }, + { + "epoch": 0.9471649484536082, + "grad_norm": 2.734375, + "learning_rate": 1.0537777777777778e-06, + "loss": 0.3632, + "step": 42630 + }, + { + "epoch": 0.9473871311766797, + "grad_norm": 2.359375, + "learning_rate": 1.0493333333333335e-06, + "loss": 0.3646, + "step": 42640 + }, + { + "epoch": 0.9476093138997511, + "grad_norm": 2.375, + "learning_rate": 1.044888888888889e-06, + "loss": 0.4065, + "step": 42650 + }, + { + "epoch": 0.9478314966228226, + "grad_norm": 2.890625, + "learning_rate": 1.0404444444444444e-06, + "loss": 0.3758, + "step": 42660 + }, + { + "epoch": 0.948053679345894, + "grad_norm": 2.53125, + "learning_rate": 1.0360000000000001e-06, + "loss": 0.3788, + "step": 42670 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 3.046875, + "learning_rate": 1.0315555555555556e-06, + "loss": 0.3928, + "step": 42680 + }, + { + "epoch": 0.9484980447920369, + "grad_norm": 2.453125, + "learning_rate": 1.0271111111111113e-06, + "loss": 0.3944, + "step": 42690 + }, + { + "epoch": 0.9487202275151084, + "grad_norm": 2.546875, + "learning_rate": 1.0226666666666668e-06, + "loss": 0.3966, + "step": 42700 + }, + { + "epoch": 0.9489424102381799, + "grad_norm": 2.125, + "learning_rate": 1.0182222222222223e-06, + "loss": 0.3592, + "step": 42710 + }, + { + "epoch": 0.9491645929612513, + "grad_norm": 2.203125, + "learning_rate": 1.0137777777777777e-06, + "loss": 0.3896, + "step": 42720 + }, + { + "epoch": 0.9493867756843228, + "grad_norm": 2.640625, + "learning_rate": 1.0093333333333334e-06, + "loss": 0.3657, + "step": 42730 + }, + { + "epoch": 0.9496089584073942, + "grad_norm": 2.0625, + "learning_rate": 1.004888888888889e-06, + "loss": 0.3815, + "step": 42740 + }, + { + "epoch": 0.9498311411304657, + "grad_norm": 3.015625, + "learning_rate": 1.0004444444444446e-06, + "loss": 0.3694, + "step": 42750 + }, + { + "epoch": 0.9500533238535371, + "grad_norm": 2.234375, + "learning_rate": 9.96e-07, + "loss": 0.3747, + "step": 42760 + }, + { + "epoch": 0.9502755065766086, + "grad_norm": 2.546875, + "learning_rate": 9.915555555555556e-07, + "loss": 0.3493, + "step": 42770 + }, + { + "epoch": 0.95049768929968, + "grad_norm": 2.578125, + "learning_rate": 9.871111111111113e-07, + "loss": 0.419, + "step": 42780 + }, + { + "epoch": 0.9507198720227515, + "grad_norm": 2.75, + "learning_rate": 9.826666666666667e-07, + "loss": 0.3941, + "step": 42790 + }, + { + "epoch": 0.9509420547458229, + "grad_norm": 2.46875, + "learning_rate": 9.782222222222224e-07, + "loss": 0.3739, + "step": 42800 + }, + { + "epoch": 0.9511642374688944, + "grad_norm": 2.8125, + "learning_rate": 9.737777777777777e-07, + "loss": 0.3549, + "step": 42810 + }, + { + "epoch": 0.9513864201919658, + "grad_norm": 2.546875, + "learning_rate": 9.693333333333334e-07, + "loss": 0.403, + "step": 42820 + }, + { + "epoch": 0.9516086029150373, + "grad_norm": 2.46875, + "learning_rate": 9.648888888888889e-07, + "loss": 0.3846, + "step": 42830 + }, + { + "epoch": 0.9518307856381087, + "grad_norm": 2.71875, + "learning_rate": 9.604444444444446e-07, + "loss": 0.3849, + "step": 42840 + }, + { + "epoch": 0.9520529683611803, + "grad_norm": 2.453125, + "learning_rate": 9.56e-07, + "loss": 0.3603, + "step": 42850 + }, + { + "epoch": 0.9522751510842516, + "grad_norm": 2.546875, + "learning_rate": 9.515555555555555e-07, + "loss": 0.3794, + "step": 42860 + }, + { + "epoch": 0.9524973338073232, + "grad_norm": 2.875, + "learning_rate": 9.471111111111111e-07, + "loss": 0.4247, + "step": 42870 + }, + { + "epoch": 0.9527195165303946, + "grad_norm": 2.375, + "learning_rate": 9.426666666666667e-07, + "loss": 0.3788, + "step": 42880 + }, + { + "epoch": 0.9529416992534661, + "grad_norm": 2.71875, + "learning_rate": 9.382222222222223e-07, + "loss": 0.3558, + "step": 42890 + }, + { + "epoch": 0.9531638819765375, + "grad_norm": 2.421875, + "learning_rate": 9.337777777777779e-07, + "loss": 0.3491, + "step": 42900 + }, + { + "epoch": 0.953386064699609, + "grad_norm": 2.5625, + "learning_rate": 9.293333333333334e-07, + "loss": 0.388, + "step": 42910 + }, + { + "epoch": 0.9536082474226805, + "grad_norm": 2.609375, + "learning_rate": 9.248888888888889e-07, + "loss": 0.3641, + "step": 42920 + }, + { + "epoch": 0.9538304301457519, + "grad_norm": 2.640625, + "learning_rate": 9.204444444444445e-07, + "loss": 0.3852, + "step": 42930 + }, + { + "epoch": 0.9540526128688234, + "grad_norm": 2.140625, + "learning_rate": 9.160000000000001e-07, + "loss": 0.3894, + "step": 42940 + }, + { + "epoch": 0.9542747955918948, + "grad_norm": 2.640625, + "learning_rate": 9.115555555555557e-07, + "loss": 0.39, + "step": 42950 + }, + { + "epoch": 0.9544969783149663, + "grad_norm": 2.484375, + "learning_rate": 9.071111111111113e-07, + "loss": 0.328, + "step": 42960 + }, + { + "epoch": 0.9547191610380377, + "grad_norm": 2.78125, + "learning_rate": 9.026666666666666e-07, + "loss": 0.3202, + "step": 42970 + }, + { + "epoch": 0.9549413437611092, + "grad_norm": 2.296875, + "learning_rate": 8.982222222222222e-07, + "loss": 0.3493, + "step": 42980 + }, + { + "epoch": 0.9551635264841806, + "grad_norm": 2.265625, + "learning_rate": 8.937777777777778e-07, + "loss": 0.3455, + "step": 42990 + }, + { + "epoch": 0.9553857092072521, + "grad_norm": 2.609375, + "learning_rate": 8.893333333333334e-07, + "loss": 0.3665, + "step": 43000 + }, + { + "epoch": 0.9556078919303235, + "grad_norm": 2.0625, + "learning_rate": 8.84888888888889e-07, + "loss": 0.3596, + "step": 43010 + }, + { + "epoch": 0.955830074653395, + "grad_norm": 2.9375, + "learning_rate": 8.804444444444445e-07, + "loss": 0.4035, + "step": 43020 + }, + { + "epoch": 0.9560522573764664, + "grad_norm": 2.875, + "learning_rate": 8.760000000000001e-07, + "loss": 0.4075, + "step": 43030 + }, + { + "epoch": 0.9562744400995379, + "grad_norm": 2.328125, + "learning_rate": 8.715555555555556e-07, + "loss": 0.3831, + "step": 43040 + }, + { + "epoch": 0.9564966228226093, + "grad_norm": 3.078125, + "learning_rate": 8.671111111111112e-07, + "loss": 0.3732, + "step": 43050 + }, + { + "epoch": 0.9567188055456808, + "grad_norm": 2.234375, + "learning_rate": 8.626666666666668e-07, + "loss": 0.3609, + "step": 43060 + }, + { + "epoch": 0.9569409882687522, + "grad_norm": 3.0, + "learning_rate": 8.582222222222222e-07, + "loss": 0.4022, + "step": 43070 + }, + { + "epoch": 0.9571631709918237, + "grad_norm": 2.515625, + "learning_rate": 8.537777777777778e-07, + "loss": 0.3679, + "step": 43080 + }, + { + "epoch": 0.9573853537148951, + "grad_norm": 2.296875, + "learning_rate": 8.493333333333334e-07, + "loss": 0.3745, + "step": 43090 + }, + { + "epoch": 0.9576075364379666, + "grad_norm": 2.78125, + "learning_rate": 8.44888888888889e-07, + "loss": 0.4094, + "step": 43100 + }, + { + "epoch": 0.957829719161038, + "grad_norm": 2.90625, + "learning_rate": 8.404444444444445e-07, + "loss": 0.3889, + "step": 43110 + }, + { + "epoch": 0.9580519018841095, + "grad_norm": 2.453125, + "learning_rate": 8.36e-07, + "loss": 0.3631, + "step": 43120 + }, + { + "epoch": 0.958274084607181, + "grad_norm": 2.28125, + "learning_rate": 8.315555555555556e-07, + "loss": 0.3851, + "step": 43130 + }, + { + "epoch": 0.9584962673302524, + "grad_norm": 2.53125, + "learning_rate": 8.271111111111112e-07, + "loss": 0.3862, + "step": 43140 + }, + { + "epoch": 0.9587184500533239, + "grad_norm": 2.703125, + "learning_rate": 8.226666666666668e-07, + "loss": 0.3828, + "step": 43150 + }, + { + "epoch": 0.9589406327763953, + "grad_norm": 2.734375, + "learning_rate": 8.182222222222224e-07, + "loss": 0.3638, + "step": 43160 + }, + { + "epoch": 0.9591628154994668, + "grad_norm": 2.640625, + "learning_rate": 8.137777777777777e-07, + "loss": 0.4227, + "step": 43170 + }, + { + "epoch": 0.9593849982225382, + "grad_norm": 2.53125, + "learning_rate": 8.093333333333333e-07, + "loss": 0.3688, + "step": 43180 + }, + { + "epoch": 0.9596071809456097, + "grad_norm": 2.625, + "learning_rate": 8.048888888888889e-07, + "loss": 0.3755, + "step": 43190 + }, + { + "epoch": 0.9598293636686811, + "grad_norm": 2.421875, + "learning_rate": 8.004444444444445e-07, + "loss": 0.3626, + "step": 43200 + }, + { + "epoch": 0.9600515463917526, + "grad_norm": 2.375, + "learning_rate": 7.960000000000001e-07, + "loss": 0.3469, + "step": 43210 + }, + { + "epoch": 0.960273729114824, + "grad_norm": 2.421875, + "learning_rate": 7.915555555555557e-07, + "loss": 0.3896, + "step": 43220 + }, + { + "epoch": 0.9604959118378955, + "grad_norm": 2.78125, + "learning_rate": 7.871111111111112e-07, + "loss": 0.3818, + "step": 43230 + }, + { + "epoch": 0.9607180945609669, + "grad_norm": 2.796875, + "learning_rate": 7.826666666666667e-07, + "loss": 0.3521, + "step": 43240 + }, + { + "epoch": 0.9609402772840384, + "grad_norm": 2.8125, + "learning_rate": 7.782222222222223e-07, + "loss": 0.3824, + "step": 43250 + }, + { + "epoch": 0.9611624600071098, + "grad_norm": 2.734375, + "learning_rate": 7.737777777777779e-07, + "loss": 0.3383, + "step": 43260 + }, + { + "epoch": 0.9613846427301813, + "grad_norm": 2.328125, + "learning_rate": 7.693333333333335e-07, + "loss": 0.3442, + "step": 43270 + }, + { + "epoch": 0.9616068254532527, + "grad_norm": 2.375, + "learning_rate": 7.648888888888889e-07, + "loss": 0.3612, + "step": 43280 + }, + { + "epoch": 0.9618290081763242, + "grad_norm": 2.765625, + "learning_rate": 7.604444444444445e-07, + "loss": 0.3635, + "step": 43290 + }, + { + "epoch": 0.9620511908993956, + "grad_norm": 3.0, + "learning_rate": 7.56e-07, + "loss": 0.3861, + "step": 43300 + }, + { + "epoch": 0.9622733736224671, + "grad_norm": 2.59375, + "learning_rate": 7.515555555555556e-07, + "loss": 0.3739, + "step": 43310 + }, + { + "epoch": 0.9624955563455385, + "grad_norm": 1.9609375, + "learning_rate": 7.471111111111112e-07, + "loss": 0.336, + "step": 43320 + }, + { + "epoch": 0.96271773906861, + "grad_norm": 2.4375, + "learning_rate": 7.426666666666667e-07, + "loss": 0.3691, + "step": 43330 + }, + { + "epoch": 0.9629399217916815, + "grad_norm": 2.234375, + "learning_rate": 7.382222222222223e-07, + "loss": 0.3891, + "step": 43340 + }, + { + "epoch": 0.9631621045147529, + "grad_norm": 2.578125, + "learning_rate": 7.337777777777779e-07, + "loss": 0.4046, + "step": 43350 + }, + { + "epoch": 0.9633842872378244, + "grad_norm": 2.234375, + "learning_rate": 7.293333333333335e-07, + "loss": 0.3876, + "step": 43360 + }, + { + "epoch": 0.9636064699608958, + "grad_norm": 2.9375, + "learning_rate": 7.24888888888889e-07, + "loss": 0.3588, + "step": 43370 + }, + { + "epoch": 0.9638286526839673, + "grad_norm": 2.53125, + "learning_rate": 7.204444444444444e-07, + "loss": 0.3977, + "step": 43380 + }, + { + "epoch": 0.9640508354070387, + "grad_norm": 2.984375, + "learning_rate": 7.16e-07, + "loss": 0.4028, + "step": 43390 + }, + { + "epoch": 0.9642730181301102, + "grad_norm": 2.65625, + "learning_rate": 7.115555555555556e-07, + "loss": 0.373, + "step": 43400 + }, + { + "epoch": 0.9644952008531816, + "grad_norm": 2.3125, + "learning_rate": 7.071111111111112e-07, + "loss": 0.3863, + "step": 43410 + }, + { + "epoch": 0.9647173835762531, + "grad_norm": 2.265625, + "learning_rate": 7.026666666666668e-07, + "loss": 0.3439, + "step": 43420 + }, + { + "epoch": 0.9649395662993245, + "grad_norm": 2.125, + "learning_rate": 6.982222222222222e-07, + "loss": 0.3628, + "step": 43430 + }, + { + "epoch": 0.965161749022396, + "grad_norm": 2.125, + "learning_rate": 6.937777777777778e-07, + "loss": 0.3671, + "step": 43440 + }, + { + "epoch": 0.9653839317454674, + "grad_norm": 2.25, + "learning_rate": 6.893333333333334e-07, + "loss": 0.3832, + "step": 43450 + }, + { + "epoch": 0.965606114468539, + "grad_norm": 2.59375, + "learning_rate": 6.84888888888889e-07, + "loss": 0.3556, + "step": 43460 + }, + { + "epoch": 0.9658282971916103, + "grad_norm": 3.140625, + "learning_rate": 6.804444444444446e-07, + "loss": 0.3729, + "step": 43470 + }, + { + "epoch": 0.9660504799146818, + "grad_norm": 3.03125, + "learning_rate": 6.76e-07, + "loss": 0.4146, + "step": 43480 + }, + { + "epoch": 0.9662726626377532, + "grad_norm": 2.140625, + "learning_rate": 6.715555555555556e-07, + "loss": 0.3773, + "step": 43490 + }, + { + "epoch": 0.9664948453608248, + "grad_norm": 2.390625, + "learning_rate": 6.671111111111111e-07, + "loss": 0.3963, + "step": 43500 + }, + { + "epoch": 0.9667170280838961, + "grad_norm": 2.5625, + "learning_rate": 6.626666666666667e-07, + "loss": 0.3538, + "step": 43510 + }, + { + "epoch": 0.9669392108069677, + "grad_norm": 2.34375, + "learning_rate": 6.582222222222223e-07, + "loss": 0.3399, + "step": 43520 + }, + { + "epoch": 0.9671613935300392, + "grad_norm": 2.5, + "learning_rate": 6.537777777777779e-07, + "loss": 0.3567, + "step": 43530 + }, + { + "epoch": 0.9673835762531106, + "grad_norm": 2.859375, + "learning_rate": 6.493333333333334e-07, + "loss": 0.4067, + "step": 43540 + }, + { + "epoch": 0.9676057589761821, + "grad_norm": 2.703125, + "learning_rate": 6.44888888888889e-07, + "loss": 0.3565, + "step": 43550 + }, + { + "epoch": 0.9678279416992535, + "grad_norm": 2.890625, + "learning_rate": 6.404444444444446e-07, + "loss": 0.3918, + "step": 43560 + }, + { + "epoch": 0.968050124422325, + "grad_norm": 2.546875, + "learning_rate": 6.360000000000001e-07, + "loss": 0.3324, + "step": 43570 + }, + { + "epoch": 0.9682723071453964, + "grad_norm": 2.59375, + "learning_rate": 6.315555555555557e-07, + "loss": 0.3777, + "step": 43580 + }, + { + "epoch": 0.9684944898684679, + "grad_norm": 2.40625, + "learning_rate": 6.271111111111111e-07, + "loss": 0.3734, + "step": 43590 + }, + { + "epoch": 0.9687166725915393, + "grad_norm": 2.203125, + "learning_rate": 6.226666666666667e-07, + "loss": 0.3634, + "step": 43600 + }, + { + "epoch": 0.9689388553146108, + "grad_norm": 2.515625, + "learning_rate": 6.182222222222223e-07, + "loss": 0.3954, + "step": 43610 + }, + { + "epoch": 0.9691610380376822, + "grad_norm": 2.8125, + "learning_rate": 6.137777777777779e-07, + "loss": 0.4005, + "step": 43620 + }, + { + "epoch": 0.9693832207607537, + "grad_norm": 2.359375, + "learning_rate": 6.093333333333333e-07, + "loss": 0.3846, + "step": 43630 + }, + { + "epoch": 0.9696054034838251, + "grad_norm": 2.375, + "learning_rate": 6.048888888888889e-07, + "loss": 0.3847, + "step": 43640 + }, + { + "epoch": 0.9698275862068966, + "grad_norm": 2.515625, + "learning_rate": 6.004444444444445e-07, + "loss": 0.3993, + "step": 43650 + }, + { + "epoch": 0.970049768929968, + "grad_norm": 2.75, + "learning_rate": 5.960000000000001e-07, + "loss": 0.3835, + "step": 43660 + }, + { + "epoch": 0.9702719516530395, + "grad_norm": 2.28125, + "learning_rate": 5.915555555555557e-07, + "loss": 0.3449, + "step": 43670 + }, + { + "epoch": 0.9704941343761109, + "grad_norm": 2.359375, + "learning_rate": 5.871111111111112e-07, + "loss": 0.3962, + "step": 43680 + }, + { + "epoch": 0.9707163170991824, + "grad_norm": 2.328125, + "learning_rate": 5.826666666666667e-07, + "loss": 0.3536, + "step": 43690 + }, + { + "epoch": 0.9709384998222538, + "grad_norm": 2.46875, + "learning_rate": 5.782222222222222e-07, + "loss": 0.3503, + "step": 43700 + }, + { + "epoch": 0.9711606825453253, + "grad_norm": 2.453125, + "learning_rate": 5.737777777777778e-07, + "loss": 0.3566, + "step": 43710 + }, + { + "epoch": 0.9713828652683967, + "grad_norm": 2.890625, + "learning_rate": 5.693333333333334e-07, + "loss": 0.4052, + "step": 43720 + }, + { + "epoch": 0.9716050479914682, + "grad_norm": 2.421875, + "learning_rate": 5.648888888888889e-07, + "loss": 0.3622, + "step": 43730 + }, + { + "epoch": 0.9718272307145397, + "grad_norm": 2.765625, + "learning_rate": 5.604444444444445e-07, + "loss": 0.3668, + "step": 43740 + }, + { + "epoch": 0.9720494134376111, + "grad_norm": 2.15625, + "learning_rate": 5.560000000000001e-07, + "loss": 0.3619, + "step": 43750 + }, + { + "epoch": 0.9722715961606826, + "grad_norm": 2.875, + "learning_rate": 5.515555555555556e-07, + "loss": 0.4131, + "step": 43760 + }, + { + "epoch": 0.972493778883754, + "grad_norm": 2.28125, + "learning_rate": 5.471111111111112e-07, + "loss": 0.3642, + "step": 43770 + }, + { + "epoch": 0.9727159616068255, + "grad_norm": 2.15625, + "learning_rate": 5.426666666666667e-07, + "loss": 0.3549, + "step": 43780 + }, + { + "epoch": 0.9729381443298969, + "grad_norm": 2.578125, + "learning_rate": 5.382222222222223e-07, + "loss": 0.3783, + "step": 43790 + }, + { + "epoch": 0.9731603270529684, + "grad_norm": 2.3125, + "learning_rate": 5.337777777777779e-07, + "loss": 0.3919, + "step": 43800 + }, + { + "epoch": 0.9733825097760398, + "grad_norm": 2.65625, + "learning_rate": 5.293333333333334e-07, + "loss": 0.3956, + "step": 43810 + }, + { + "epoch": 0.9736046924991113, + "grad_norm": 3.125, + "learning_rate": 5.24888888888889e-07, + "loss": 0.4097, + "step": 43820 + }, + { + "epoch": 0.9738268752221827, + "grad_norm": 2.421875, + "learning_rate": 5.204444444444444e-07, + "loss": 0.3635, + "step": 43830 + }, + { + "epoch": 0.9740490579452542, + "grad_norm": 2.328125, + "learning_rate": 5.16e-07, + "loss": 0.4085, + "step": 43840 + }, + { + "epoch": 0.9742712406683256, + "grad_norm": 2.640625, + "learning_rate": 5.115555555555556e-07, + "loss": 0.3866, + "step": 43850 + }, + { + "epoch": 0.9744934233913971, + "grad_norm": 2.859375, + "learning_rate": 5.071111111111112e-07, + "loss": 0.3987, + "step": 43860 + }, + { + "epoch": 0.9747156061144685, + "grad_norm": 2.046875, + "learning_rate": 5.026666666666667e-07, + "loss": 0.3365, + "step": 43870 + }, + { + "epoch": 0.97493778883754, + "grad_norm": 2.796875, + "learning_rate": 4.982222222222223e-07, + "loss": 0.3901, + "step": 43880 + }, + { + "epoch": 0.9751599715606114, + "grad_norm": 2.484375, + "learning_rate": 4.937777777777778e-07, + "loss": 0.388, + "step": 43890 + }, + { + "epoch": 0.9753821542836829, + "grad_norm": 2.5, + "learning_rate": 4.893333333333334e-07, + "loss": 0.3724, + "step": 43900 + }, + { + "epoch": 0.9756043370067543, + "grad_norm": 2.390625, + "learning_rate": 4.848888888888889e-07, + "loss": 0.3528, + "step": 43910 + }, + { + "epoch": 0.9758265197298258, + "grad_norm": 2.4375, + "learning_rate": 4.804444444444445e-07, + "loss": 0.351, + "step": 43920 + }, + { + "epoch": 0.9760487024528972, + "grad_norm": 2.375, + "learning_rate": 4.760000000000001e-07, + "loss": 0.3827, + "step": 43930 + }, + { + "epoch": 0.9762708851759687, + "grad_norm": 2.6875, + "learning_rate": 4.7155555555555556e-07, + "loss": 0.3802, + "step": 43940 + }, + { + "epoch": 0.9764930678990402, + "grad_norm": 3.015625, + "learning_rate": 4.6711111111111115e-07, + "loss": 0.3979, + "step": 43950 + }, + { + "epoch": 0.9767152506221116, + "grad_norm": 2.734375, + "learning_rate": 4.626666666666667e-07, + "loss": 0.3912, + "step": 43960 + }, + { + "epoch": 0.9769374333451831, + "grad_norm": 2.609375, + "learning_rate": 4.5822222222222227e-07, + "loss": 0.42, + "step": 43970 + }, + { + "epoch": 0.9771596160682545, + "grad_norm": 2.59375, + "learning_rate": 4.5377777777777785e-07, + "loss": 0.3759, + "step": 43980 + }, + { + "epoch": 0.977381798791326, + "grad_norm": 2.5, + "learning_rate": 4.4933333333333333e-07, + "loss": 0.3879, + "step": 43990 + }, + { + "epoch": 0.9776039815143974, + "grad_norm": 2.40625, + "learning_rate": 4.448888888888889e-07, + "loss": 0.3706, + "step": 44000 + }, + { + "epoch": 0.9778261642374689, + "grad_norm": 2.21875, + "learning_rate": 4.4044444444444445e-07, + "loss": 0.3588, + "step": 44010 + }, + { + "epoch": 0.9780483469605403, + "grad_norm": 2.46875, + "learning_rate": 4.3600000000000004e-07, + "loss": 0.3787, + "step": 44020 + }, + { + "epoch": 0.9782705296836118, + "grad_norm": 2.203125, + "learning_rate": 4.315555555555556e-07, + "loss": 0.3515, + "step": 44030 + }, + { + "epoch": 0.9784927124066832, + "grad_norm": 2.75, + "learning_rate": 4.271111111111111e-07, + "loss": 0.3578, + "step": 44040 + }, + { + "epoch": 0.9787148951297547, + "grad_norm": 2.8125, + "learning_rate": 4.226666666666667e-07, + "loss": 0.3804, + "step": 44050 + }, + { + "epoch": 0.9789370778528261, + "grad_norm": 2.59375, + "learning_rate": 4.182222222222222e-07, + "loss": 0.4112, + "step": 44060 + }, + { + "epoch": 0.9791592605758976, + "grad_norm": 2.671875, + "learning_rate": 4.137777777777778e-07, + "loss": 0.3923, + "step": 44070 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 2.171875, + "learning_rate": 4.093333333333334e-07, + "loss": 0.3167, + "step": 44080 + }, + { + "epoch": 0.9796036260220405, + "grad_norm": 2.125, + "learning_rate": 4.048888888888889e-07, + "loss": 0.4019, + "step": 44090 + }, + { + "epoch": 0.9798258087451119, + "grad_norm": 2.75, + "learning_rate": 4.0044444444444447e-07, + "loss": 0.3911, + "step": 44100 + }, + { + "epoch": 0.9800479914681834, + "grad_norm": 2.875, + "learning_rate": 3.9600000000000005e-07, + "loss": 0.356, + "step": 44110 + }, + { + "epoch": 0.9802701741912548, + "grad_norm": 2.25, + "learning_rate": 3.915555555555556e-07, + "loss": 0.3712, + "step": 44120 + }, + { + "epoch": 0.9804923569143263, + "grad_norm": 2.828125, + "learning_rate": 3.8711111111111117e-07, + "loss": 0.4247, + "step": 44130 + }, + { + "epoch": 0.9807145396373977, + "grad_norm": 2.609375, + "learning_rate": 3.8266666666666665e-07, + "loss": 0.4134, + "step": 44140 + }, + { + "epoch": 0.9809367223604692, + "grad_norm": 2.78125, + "learning_rate": 3.7822222222222224e-07, + "loss": 0.3876, + "step": 44150 + }, + { + "epoch": 0.9811589050835408, + "grad_norm": 2.90625, + "learning_rate": 3.737777777777778e-07, + "loss": 0.3674, + "step": 44160 + }, + { + "epoch": 0.9813810878066122, + "grad_norm": 2.484375, + "learning_rate": 3.6933333333333336e-07, + "loss": 0.3346, + "step": 44170 + }, + { + "epoch": 0.9816032705296837, + "grad_norm": 2.515625, + "learning_rate": 3.6488888888888894e-07, + "loss": 0.3563, + "step": 44180 + }, + { + "epoch": 0.981825453252755, + "grad_norm": 2.265625, + "learning_rate": 3.604444444444444e-07, + "loss": 0.4051, + "step": 44190 + }, + { + "epoch": 0.9820476359758266, + "grad_norm": 2.5, + "learning_rate": 3.56e-07, + "loss": 0.3901, + "step": 44200 + }, + { + "epoch": 0.982269818698898, + "grad_norm": 3.15625, + "learning_rate": 3.515555555555556e-07, + "loss": 0.4045, + "step": 44210 + }, + { + "epoch": 0.9824920014219695, + "grad_norm": 2.28125, + "learning_rate": 3.4711111111111113e-07, + "loss": 0.3926, + "step": 44220 + }, + { + "epoch": 0.9827141841450409, + "grad_norm": 2.421875, + "learning_rate": 3.426666666666667e-07, + "loss": 0.3516, + "step": 44230 + }, + { + "epoch": 0.9829363668681124, + "grad_norm": 2.75, + "learning_rate": 3.382222222222222e-07, + "loss": 0.393, + "step": 44240 + }, + { + "epoch": 0.9831585495911838, + "grad_norm": 2.4375, + "learning_rate": 3.337777777777778e-07, + "loss": 0.3764, + "step": 44250 + }, + { + "epoch": 0.9833807323142553, + "grad_norm": 2.953125, + "learning_rate": 3.2933333333333337e-07, + "loss": 0.3908, + "step": 44260 + }, + { + "epoch": 0.9836029150373267, + "grad_norm": 2.765625, + "learning_rate": 3.248888888888889e-07, + "loss": 0.4109, + "step": 44270 + }, + { + "epoch": 0.9838250977603982, + "grad_norm": 2.40625, + "learning_rate": 3.204444444444445e-07, + "loss": 0.3827, + "step": 44280 + }, + { + "epoch": 0.9840472804834696, + "grad_norm": 2.3125, + "learning_rate": 3.160000000000001e-07, + "loss": 0.386, + "step": 44290 + }, + { + "epoch": 0.9842694632065411, + "grad_norm": 2.8125, + "learning_rate": 3.1155555555555556e-07, + "loss": 0.3903, + "step": 44300 + }, + { + "epoch": 0.9844916459296125, + "grad_norm": 2.734375, + "learning_rate": 3.0711111111111114e-07, + "loss": 0.4183, + "step": 44310 + }, + { + "epoch": 0.984713828652684, + "grad_norm": 2.53125, + "learning_rate": 3.026666666666667e-07, + "loss": 0.39, + "step": 44320 + }, + { + "epoch": 0.9849360113757554, + "grad_norm": 2.703125, + "learning_rate": 2.9822222222222226e-07, + "loss": 0.4059, + "step": 44330 + }, + { + "epoch": 0.9851581940988269, + "grad_norm": 2.59375, + "learning_rate": 2.937777777777778e-07, + "loss": 0.3491, + "step": 44340 + }, + { + "epoch": 0.9853803768218983, + "grad_norm": 2.671875, + "learning_rate": 2.8933333333333333e-07, + "loss": 0.3557, + "step": 44350 + }, + { + "epoch": 0.9856025595449698, + "grad_norm": 2.265625, + "learning_rate": 2.848888888888889e-07, + "loss": 0.4059, + "step": 44360 + }, + { + "epoch": 0.9858247422680413, + "grad_norm": 2.5625, + "learning_rate": 2.8044444444444445e-07, + "loss": 0.3636, + "step": 44370 + }, + { + "epoch": 0.9860469249911127, + "grad_norm": 3.203125, + "learning_rate": 2.7600000000000004e-07, + "loss": 0.3925, + "step": 44380 + }, + { + "epoch": 0.9862691077141842, + "grad_norm": 2.328125, + "learning_rate": 2.7155555555555557e-07, + "loss": 0.358, + "step": 44390 + }, + { + "epoch": 0.9864912904372556, + "grad_norm": 2.390625, + "learning_rate": 2.6711111111111116e-07, + "loss": 0.3771, + "step": 44400 + }, + { + "epoch": 0.9867134731603271, + "grad_norm": 2.734375, + "learning_rate": 2.626666666666667e-07, + "loss": 0.4057, + "step": 44410 + }, + { + "epoch": 0.9869356558833985, + "grad_norm": 2.953125, + "learning_rate": 2.582222222222222e-07, + "loss": 0.3639, + "step": 44420 + }, + { + "epoch": 0.98715783860647, + "grad_norm": 2.765625, + "learning_rate": 2.537777777777778e-07, + "loss": 0.397, + "step": 44430 + }, + { + "epoch": 0.9873800213295414, + "grad_norm": 2.765625, + "learning_rate": 2.4933333333333334e-07, + "loss": 0.3519, + "step": 44440 + }, + { + "epoch": 0.9876022040526129, + "grad_norm": 1.90625, + "learning_rate": 2.4488888888888893e-07, + "loss": 0.3831, + "step": 44450 + }, + { + "epoch": 0.9878243867756843, + "grad_norm": 2.3125, + "learning_rate": 2.4044444444444446e-07, + "loss": 0.3542, + "step": 44460 + }, + { + "epoch": 0.9880465694987558, + "grad_norm": 2.296875, + "learning_rate": 2.3600000000000002e-07, + "loss": 0.3645, + "step": 44470 + }, + { + "epoch": 0.9882687522218272, + "grad_norm": 2.40625, + "learning_rate": 2.3155555555555556e-07, + "loss": 0.3595, + "step": 44480 + }, + { + "epoch": 0.9884909349448987, + "grad_norm": 2.78125, + "learning_rate": 2.2711111111111114e-07, + "loss": 0.3782, + "step": 44490 + }, + { + "epoch": 0.9887131176679701, + "grad_norm": 3.0, + "learning_rate": 2.226666666666667e-07, + "loss": 0.4173, + "step": 44500 + }, + { + "epoch": 0.9889353003910416, + "grad_norm": 2.671875, + "learning_rate": 2.1822222222222224e-07, + "loss": 0.4032, + "step": 44510 + }, + { + "epoch": 0.989157483114113, + "grad_norm": 2.21875, + "learning_rate": 2.137777777777778e-07, + "loss": 0.3444, + "step": 44520 + }, + { + "epoch": 0.9893796658371845, + "grad_norm": 2.703125, + "learning_rate": 2.0933333333333335e-07, + "loss": 0.3868, + "step": 44530 + }, + { + "epoch": 0.9896018485602559, + "grad_norm": 2.53125, + "learning_rate": 2.0488888888888891e-07, + "loss": 0.3836, + "step": 44540 + }, + { + "epoch": 0.9898240312833274, + "grad_norm": 2.78125, + "learning_rate": 2.0044444444444447e-07, + "loss": 0.3686, + "step": 44550 + }, + { + "epoch": 0.9900462140063989, + "grad_norm": 2.5625, + "learning_rate": 1.96e-07, + "loss": 0.3866, + "step": 44560 + }, + { + "epoch": 0.9902683967294703, + "grad_norm": 2.84375, + "learning_rate": 1.9155555555555557e-07, + "loss": 0.3696, + "step": 44570 + }, + { + "epoch": 0.9904905794525418, + "grad_norm": 2.125, + "learning_rate": 1.8711111111111113e-07, + "loss": 0.376, + "step": 44580 + }, + { + "epoch": 0.9907127621756132, + "grad_norm": 2.328125, + "learning_rate": 1.826666666666667e-07, + "loss": 0.3585, + "step": 44590 + }, + { + "epoch": 0.9909349448986847, + "grad_norm": 3.0, + "learning_rate": 1.7822222222222222e-07, + "loss": 0.3533, + "step": 44600 + }, + { + "epoch": 0.9911571276217561, + "grad_norm": 2.75, + "learning_rate": 1.7377777777777778e-07, + "loss": 0.3631, + "step": 44610 + }, + { + "epoch": 0.9913793103448276, + "grad_norm": 2.5625, + "learning_rate": 1.6933333333333337e-07, + "loss": 0.3594, + "step": 44620 + }, + { + "epoch": 0.991601493067899, + "grad_norm": 2.34375, + "learning_rate": 1.648888888888889e-07, + "loss": 0.3878, + "step": 44630 + }, + { + "epoch": 0.9918236757909705, + "grad_norm": 1.9375, + "learning_rate": 1.6044444444444446e-07, + "loss": 0.3635, + "step": 44640 + }, + { + "epoch": 0.9920458585140419, + "grad_norm": 2.390625, + "learning_rate": 1.56e-07, + "loss": 0.3724, + "step": 44650 + }, + { + "epoch": 0.9922680412371134, + "grad_norm": 2.421875, + "learning_rate": 1.5155555555555558e-07, + "loss": 0.3796, + "step": 44660 + }, + { + "epoch": 0.9924902239601848, + "grad_norm": 2.46875, + "learning_rate": 1.4711111111111111e-07, + "loss": 0.3429, + "step": 44670 + }, + { + "epoch": 0.9927124066832563, + "grad_norm": 2.734375, + "learning_rate": 1.4266666666666667e-07, + "loss": 0.3699, + "step": 44680 + }, + { + "epoch": 0.9929345894063277, + "grad_norm": 2.5625, + "learning_rate": 1.3822222222222223e-07, + "loss": 0.3655, + "step": 44690 + }, + { + "epoch": 0.9931567721293992, + "grad_norm": 2.125, + "learning_rate": 1.337777777777778e-07, + "loss": 0.392, + "step": 44700 + }, + { + "epoch": 0.9933789548524706, + "grad_norm": 2.640625, + "learning_rate": 1.2933333333333335e-07, + "loss": 0.4055, + "step": 44710 + }, + { + "epoch": 0.9936011375755421, + "grad_norm": 2.484375, + "learning_rate": 1.248888888888889e-07, + "loss": 0.4049, + "step": 44720 + }, + { + "epoch": 0.9938233202986135, + "grad_norm": 2.984375, + "learning_rate": 1.2044444444444445e-07, + "loss": 0.3453, + "step": 44730 + }, + { + "epoch": 0.994045503021685, + "grad_norm": 2.921875, + "learning_rate": 1.16e-07, + "loss": 0.3585, + "step": 44740 + }, + { + "epoch": 0.9942676857447564, + "grad_norm": 2.703125, + "learning_rate": 1.1155555555555557e-07, + "loss": 0.3736, + "step": 44750 + }, + { + "epoch": 0.9944898684678279, + "grad_norm": 2.46875, + "learning_rate": 1.0711111111111111e-07, + "loss": 0.3338, + "step": 44760 + }, + { + "epoch": 0.9947120511908994, + "grad_norm": 2.671875, + "learning_rate": 1.0266666666666667e-07, + "loss": 0.3935, + "step": 44770 + }, + { + "epoch": 0.9949342339139708, + "grad_norm": 2.140625, + "learning_rate": 9.822222222222222e-08, + "loss": 0.3645, + "step": 44780 + }, + { + "epoch": 0.9951564166370424, + "grad_norm": 2.4375, + "learning_rate": 9.377777777777779e-08, + "loss": 0.4132, + "step": 44790 + }, + { + "epoch": 0.9953785993601137, + "grad_norm": 2.265625, + "learning_rate": 8.933333333333334e-08, + "loss": 0.3645, + "step": 44800 + }, + { + "epoch": 0.9956007820831853, + "grad_norm": 2.640625, + "learning_rate": 8.48888888888889e-08, + "loss": 0.3802, + "step": 44810 + }, + { + "epoch": 0.9958229648062566, + "grad_norm": 2.375, + "learning_rate": 8.044444444444445e-08, + "loss": 0.4019, + "step": 44820 + }, + { + "epoch": 0.9960451475293282, + "grad_norm": 2.59375, + "learning_rate": 7.6e-08, + "loss": 0.3698, + "step": 44830 + }, + { + "epoch": 0.9962673302523996, + "grad_norm": 2.90625, + "learning_rate": 7.155555555555557e-08, + "loss": 0.3613, + "step": 44840 + }, + { + "epoch": 0.9964895129754711, + "grad_norm": 2.15625, + "learning_rate": 6.711111111111111e-08, + "loss": 0.334, + "step": 44850 + }, + { + "epoch": 0.9967116956985425, + "grad_norm": 2.9375, + "learning_rate": 6.266666666666667e-08, + "loss": 0.3732, + "step": 44860 + }, + { + "epoch": 0.996933878421614, + "grad_norm": 2.96875, + "learning_rate": 5.822222222222223e-08, + "loss": 0.3744, + "step": 44870 + }, + { + "epoch": 0.9971560611446854, + "grad_norm": 2.921875, + "learning_rate": 5.3777777777777785e-08, + "loss": 0.3746, + "step": 44880 + }, + { + "epoch": 0.9973782438677569, + "grad_norm": 2.171875, + "learning_rate": 4.933333333333333e-08, + "loss": 0.3952, + "step": 44890 + }, + { + "epoch": 0.9976004265908283, + "grad_norm": 2.109375, + "learning_rate": 4.488888888888889e-08, + "loss": 0.3463, + "step": 44900 + }, + { + "epoch": 0.9978226093138998, + "grad_norm": 2.640625, + "learning_rate": 4.0444444444444445e-08, + "loss": 0.3189, + "step": 44910 + }, + { + "epoch": 0.9980447920369712, + "grad_norm": 3.0, + "learning_rate": 3.6000000000000005e-08, + "loss": 0.4051, + "step": 44920 + }, + { + "epoch": 0.9982669747600427, + "grad_norm": 2.4375, + "learning_rate": 3.155555555555556e-08, + "loss": 0.3898, + "step": 44930 + }, + { + "epoch": 0.9984891574831141, + "grad_norm": 2.640625, + "learning_rate": 2.7111111111111115e-08, + "loss": 0.4055, + "step": 44940 + }, + { + "epoch": 0.9987113402061856, + "grad_norm": 3.03125, + "learning_rate": 2.266666666666667e-08, + "loss": 0.3668, + "step": 44950 + }, + { + "epoch": 0.998933522929257, + "grad_norm": 2.390625, + "learning_rate": 1.8222222222222224e-08, + "loss": 0.3609, + "step": 44960 + }, + { + "epoch": 0.9991557056523285, + "grad_norm": 2.8125, + "learning_rate": 1.3777777777777778e-08, + "loss": 0.3752, + "step": 44970 + }, + { + "epoch": 0.9993778883754, + "grad_norm": 2.609375, + "learning_rate": 9.333333333333334e-09, + "loss": 0.3807, + "step": 44980 + }, + { + "epoch": 0.9996000710984714, + "grad_norm": 2.703125, + "learning_rate": 4.888888888888889e-09, + "loss": 0.3794, + "step": 44990 + }, + { + "epoch": 0.9998222538215429, + "grad_norm": 2.75, + "learning_rate": 4.444444444444445e-10, + "loss": 0.3628, + "step": 45000 + } + ], + "logging_steps": 10, + "max_steps": 45000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3115342813852418e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}