diff --git "a/checkpoint-10000/trainer_state.json" "b/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10000/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.61333046193604, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008614191881124151, + "grad_norm": 3.921875, + "learning_rate": 1.9982000000000003e-05, + "loss": 1.3429, + "step": 10 + }, + { + "epoch": 0.017228383762248303, + "grad_norm": 3.15625, + "learning_rate": 1.9962000000000003e-05, + "loss": 0.7212, + "step": 20 + }, + { + "epoch": 0.025842575643372456, + "grad_norm": 3.015625, + "learning_rate": 1.9942e-05, + "loss": 0.6892, + "step": 30 + }, + { + "epoch": 0.034456767524496605, + "grad_norm": 2.890625, + "learning_rate": 1.9922e-05, + "loss": 0.6611, + "step": 40 + }, + { + "epoch": 0.04307095940562076, + "grad_norm": 2.875, + "learning_rate": 1.9902e-05, + "loss": 0.6514, + "step": 50 + }, + { + "epoch": 0.05168515128674491, + "grad_norm": 3.0, + "learning_rate": 1.9882e-05, + "loss": 0.6437, + "step": 60 + }, + { + "epoch": 0.060299343167869064, + "grad_norm": 2.96875, + "learning_rate": 1.9862e-05, + "loss": 0.6161, + "step": 70 + }, + { + "epoch": 0.06891353504899321, + "grad_norm": 3.109375, + "learning_rate": 1.9842e-05, + "loss": 0.6083, + "step": 80 + }, + { + "epoch": 0.07752772693011736, + "grad_norm": 3.296875, + "learning_rate": 1.9822e-05, + "loss": 0.5967, + "step": 90 + }, + { + "epoch": 0.08614191881124152, + "grad_norm": 2.953125, + "learning_rate": 1.9802e-05, + "loss": 0.5773, + "step": 100 + }, + { + "epoch": 0.09475611069236567, + "grad_norm": 3.21875, + "learning_rate": 1.9782e-05, + "loss": 0.5722, + "step": 110 + }, + { + "epoch": 0.10337030257348982, + "grad_norm": 2.796875, + "learning_rate": 1.9762e-05, + "loss": 0.5528, + "step": 120 + }, + { + "epoch": 0.11198449445461398, + "grad_norm": 2.953125, + "learning_rate": 1.9742000000000002e-05, + "loss": 0.5952, + "step": 130 + }, + { + "epoch": 0.12059868633573813, + "grad_norm": 2.6875, + "learning_rate": 1.9722000000000002e-05, + "loss": 0.5309, + "step": 140 + }, + { + "epoch": 0.12921287821686228, + "grad_norm": 2.609375, + "learning_rate": 1.9702000000000002e-05, + "loss": 0.5353, + "step": 150 + }, + { + "epoch": 0.13782707009798642, + "grad_norm": 2.71875, + "learning_rate": 1.9682000000000002e-05, + "loss": 0.5447, + "step": 160 + }, + { + "epoch": 0.1464412619791106, + "grad_norm": 2.859375, + "learning_rate": 1.9662000000000003e-05, + "loss": 0.4998, + "step": 170 + }, + { + "epoch": 0.15505545386023473, + "grad_norm": 3.09375, + "learning_rate": 1.9642000000000003e-05, + "loss": 0.5191, + "step": 180 + }, + { + "epoch": 0.1636696457413589, + "grad_norm": 3.140625, + "learning_rate": 1.9622e-05, + "loss": 0.5358, + "step": 190 + }, + { + "epoch": 0.17228383762248303, + "grad_norm": 2.515625, + "learning_rate": 1.9602e-05, + "loss": 0.4914, + "step": 200 + }, + { + "epoch": 0.1808980295036072, + "grad_norm": 2.6875, + "learning_rate": 1.9582e-05, + "loss": 0.4943, + "step": 210 + }, + { + "epoch": 0.18951222138473134, + "grad_norm": 2.53125, + "learning_rate": 1.9562e-05, + "loss": 0.4731, + "step": 220 + }, + { + "epoch": 0.1981264132658555, + "grad_norm": 2.625, + "learning_rate": 1.9542e-05, + "loss": 0.497, + "step": 230 + }, + { + "epoch": 0.20674060514697964, + "grad_norm": 2.46875, + "learning_rate": 1.9522e-05, + "loss": 0.4746, + "step": 240 + }, + { + "epoch": 0.2153547970281038, + "grad_norm": 2.515625, + "learning_rate": 1.9502e-05, + "loss": 0.4763, + "step": 250 + }, + { + "epoch": 0.22396898890922795, + "grad_norm": 2.546875, + "learning_rate": 1.9482e-05, + "loss": 0.4759, + "step": 260 + }, + { + "epoch": 0.23258318079035212, + "grad_norm": 2.71875, + "learning_rate": 1.9462e-05, + "loss": 0.5039, + "step": 270 + }, + { + "epoch": 0.24119737267147626, + "grad_norm": 2.546875, + "learning_rate": 1.9442e-05, + "loss": 0.4799, + "step": 280 + }, + { + "epoch": 0.2498115645526004, + "grad_norm": 2.84375, + "learning_rate": 1.9422e-05, + "loss": 0.4446, + "step": 290 + }, + { + "epoch": 0.25842575643372456, + "grad_norm": 2.671875, + "learning_rate": 1.9402e-05, + "loss": 0.4727, + "step": 300 + }, + { + "epoch": 0.26703994831484873, + "grad_norm": 2.890625, + "learning_rate": 1.9382000000000002e-05, + "loss": 0.4189, + "step": 310 + }, + { + "epoch": 0.27565414019597284, + "grad_norm": 2.265625, + "learning_rate": 1.9362000000000002e-05, + "loss": 0.4409, + "step": 320 + }, + { + "epoch": 0.284268332077097, + "grad_norm": 3.25, + "learning_rate": 1.9342000000000002e-05, + "loss": 0.4656, + "step": 330 + }, + { + "epoch": 0.2928825239582212, + "grad_norm": 2.578125, + "learning_rate": 1.9322000000000002e-05, + "loss": 0.4713, + "step": 340 + }, + { + "epoch": 0.30149671583934534, + "grad_norm": 2.0625, + "learning_rate": 1.9302e-05, + "loss": 0.4282, + "step": 350 + }, + { + "epoch": 0.31011090772046945, + "grad_norm": 2.953125, + "learning_rate": 1.9282e-05, + "loss": 0.4565, + "step": 360 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 2.296875, + "learning_rate": 1.9262e-05, + "loss": 0.4346, + "step": 370 + }, + { + "epoch": 0.3273392914827178, + "grad_norm": 2.828125, + "learning_rate": 1.9242e-05, + "loss": 0.426, + "step": 380 + }, + { + "epoch": 0.33595348336384195, + "grad_norm": 2.3125, + "learning_rate": 1.9222e-05, + "loss": 0.42, + "step": 390 + }, + { + "epoch": 0.34456767524496607, + "grad_norm": 2.171875, + "learning_rate": 1.9202e-05, + "loss": 0.4317, + "step": 400 + }, + { + "epoch": 0.35318186712609023, + "grad_norm": 2.59375, + "learning_rate": 1.9182e-05, + "loss": 0.4311, + "step": 410 + }, + { + "epoch": 0.3617960590072144, + "grad_norm": 2.609375, + "learning_rate": 1.9162e-05, + "loss": 0.4056, + "step": 420 + }, + { + "epoch": 0.3704102508883385, + "grad_norm": 2.1875, + "learning_rate": 1.9142e-05, + "loss": 0.4029, + "step": 430 + }, + { + "epoch": 0.3790244427694627, + "grad_norm": 3.125, + "learning_rate": 1.9122e-05, + "loss": 0.4337, + "step": 440 + }, + { + "epoch": 0.38763863465058684, + "grad_norm": 2.390625, + "learning_rate": 1.9102e-05, + "loss": 0.4381, + "step": 450 + }, + { + "epoch": 0.396252826531711, + "grad_norm": 2.796875, + "learning_rate": 1.9082e-05, + "loss": 0.4174, + "step": 460 + }, + { + "epoch": 0.4048670184128351, + "grad_norm": 2.421875, + "learning_rate": 1.9062e-05, + "loss": 0.3928, + "step": 470 + }, + { + "epoch": 0.4134812102939593, + "grad_norm": 2.515625, + "learning_rate": 1.9042e-05, + "loss": 0.4051, + "step": 480 + }, + { + "epoch": 0.42209540217508346, + "grad_norm": 2.40625, + "learning_rate": 1.9022000000000002e-05, + "loss": 0.3992, + "step": 490 + }, + { + "epoch": 0.4307095940562076, + "grad_norm": 2.21875, + "learning_rate": 1.9002000000000002e-05, + "loss": 0.4194, + "step": 500 + }, + { + "epoch": 0.43932378593733173, + "grad_norm": 3.328125, + "learning_rate": 1.8982000000000002e-05, + "loss": 0.3951, + "step": 510 + }, + { + "epoch": 0.4479379778184559, + "grad_norm": 2.234375, + "learning_rate": 1.8962000000000002e-05, + "loss": 0.3918, + "step": 520 + }, + { + "epoch": 0.45655216969958007, + "grad_norm": 2.65625, + "learning_rate": 1.8942000000000003e-05, + "loss": 0.3854, + "step": 530 + }, + { + "epoch": 0.46516636158070424, + "grad_norm": 2.0625, + "learning_rate": 1.8922000000000003e-05, + "loss": 0.3836, + "step": 540 + }, + { + "epoch": 0.47378055346182835, + "grad_norm": 2.28125, + "learning_rate": 1.8902000000000003e-05, + "loss": 0.3824, + "step": 550 + }, + { + "epoch": 0.4823947453429525, + "grad_norm": 2.625, + "learning_rate": 1.8882000000000003e-05, + "loss": 0.3913, + "step": 560 + }, + { + "epoch": 0.4910089372240767, + "grad_norm": 2.265625, + "learning_rate": 1.8862000000000003e-05, + "loss": 0.3834, + "step": 570 + }, + { + "epoch": 0.4996231291052008, + "grad_norm": 2.359375, + "learning_rate": 1.8842000000000004e-05, + "loss": 0.3848, + "step": 580 + }, + { + "epoch": 0.508237320986325, + "grad_norm": 2.34375, + "learning_rate": 1.8822000000000004e-05, + "loss": 0.3845, + "step": 590 + }, + { + "epoch": 0.5168515128674491, + "grad_norm": 2.453125, + "learning_rate": 1.8802000000000004e-05, + "loss": 0.3836, + "step": 600 + }, + { + "epoch": 0.5254657047485732, + "grad_norm": 2.0, + "learning_rate": 1.8782e-05, + "loss": 0.3799, + "step": 610 + }, + { + "epoch": 0.5340798966296975, + "grad_norm": 2.296875, + "learning_rate": 1.8762e-05, + "loss": 0.3715, + "step": 620 + }, + { + "epoch": 0.5426940885108216, + "grad_norm": 2.0625, + "learning_rate": 1.8742e-05, + "loss": 0.3825, + "step": 630 + }, + { + "epoch": 0.5513082803919457, + "grad_norm": 2.359375, + "learning_rate": 1.8722e-05, + "loss": 0.364, + "step": 640 + }, + { + "epoch": 0.5599224722730699, + "grad_norm": 2.1875, + "learning_rate": 1.8702e-05, + "loss": 0.3765, + "step": 650 + }, + { + "epoch": 0.568536664154194, + "grad_norm": 1.96875, + "learning_rate": 1.8682000000000002e-05, + "loss": 0.3748, + "step": 660 + }, + { + "epoch": 0.5771508560353182, + "grad_norm": 2.390625, + "learning_rate": 1.8662000000000002e-05, + "loss": 0.3751, + "step": 670 + }, + { + "epoch": 0.5857650479164423, + "grad_norm": 2.203125, + "learning_rate": 1.8642000000000002e-05, + "loss": 0.3778, + "step": 680 + }, + { + "epoch": 0.5943792397975665, + "grad_norm": 1.796875, + "learning_rate": 1.8622000000000002e-05, + "loss": 0.3798, + "step": 690 + }, + { + "epoch": 0.6029934316786907, + "grad_norm": 2.46875, + "learning_rate": 1.8602000000000002e-05, + "loss": 0.3682, + "step": 700 + }, + { + "epoch": 0.6116076235598148, + "grad_norm": 1.9765625, + "learning_rate": 1.8582000000000003e-05, + "loss": 0.3652, + "step": 710 + }, + { + "epoch": 0.6202218154409389, + "grad_norm": 2.25, + "learning_rate": 1.8562000000000003e-05, + "loss": 0.3658, + "step": 720 + }, + { + "epoch": 0.6288360073220631, + "grad_norm": 1.9140625, + "learning_rate": 1.8542000000000003e-05, + "loss": 0.389, + "step": 730 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 1.9375, + "learning_rate": 1.8522000000000003e-05, + "loss": 0.375, + "step": 740 + }, + { + "epoch": 0.6460643910843114, + "grad_norm": 2.140625, + "learning_rate": 1.8502000000000003e-05, + "loss": 0.3617, + "step": 750 + }, + { + "epoch": 0.6546785829654356, + "grad_norm": 1.9140625, + "learning_rate": 1.8482000000000004e-05, + "loss": 0.3777, + "step": 760 + }, + { + "epoch": 0.6632927748465597, + "grad_norm": 1.9453125, + "learning_rate": 1.8462000000000004e-05, + "loss": 0.3599, + "step": 770 + }, + { + "epoch": 0.6719069667276839, + "grad_norm": 1.75, + "learning_rate": 1.8442e-05, + "loss": 0.3495, + "step": 780 + }, + { + "epoch": 0.680521158608808, + "grad_norm": 2.109375, + "learning_rate": 1.8422e-05, + "loss": 0.3461, + "step": 790 + }, + { + "epoch": 0.6891353504899321, + "grad_norm": 2.078125, + "learning_rate": 1.8402e-05, + "loss": 0.3661, + "step": 800 + }, + { + "epoch": 0.6977495423710564, + "grad_norm": 2.03125, + "learning_rate": 1.8382e-05, + "loss": 0.3594, + "step": 810 + }, + { + "epoch": 0.7063637342521805, + "grad_norm": 1.984375, + "learning_rate": 1.8362e-05, + "loss": 0.3512, + "step": 820 + }, + { + "epoch": 0.7149779261333046, + "grad_norm": 2.0625, + "learning_rate": 1.8342e-05, + "loss": 0.3616, + "step": 830 + }, + { + "epoch": 0.7235921180144288, + "grad_norm": 2.484375, + "learning_rate": 1.8322000000000002e-05, + "loss": 0.3575, + "step": 840 + }, + { + "epoch": 0.7322063098955529, + "grad_norm": 2.1875, + "learning_rate": 1.8302000000000002e-05, + "loss": 0.3712, + "step": 850 + }, + { + "epoch": 0.740820501776677, + "grad_norm": 2.046875, + "learning_rate": 1.8282000000000002e-05, + "loss": 0.3724, + "step": 860 + }, + { + "epoch": 0.7494346936578012, + "grad_norm": 1.953125, + "learning_rate": 1.8262000000000002e-05, + "loss": 0.3524, + "step": 870 + }, + { + "epoch": 0.7580488855389254, + "grad_norm": 2.203125, + "learning_rate": 1.8242000000000003e-05, + "loss": 0.3543, + "step": 880 + }, + { + "epoch": 0.7666630774200496, + "grad_norm": 2.015625, + "learning_rate": 1.8222000000000003e-05, + "loss": 0.3697, + "step": 890 + }, + { + "epoch": 0.7752772693011737, + "grad_norm": 2.140625, + "learning_rate": 1.8202000000000003e-05, + "loss": 0.3583, + "step": 900 + }, + { + "epoch": 0.7838914611822978, + "grad_norm": 1.9921875, + "learning_rate": 1.8182000000000003e-05, + "loss": 0.3753, + "step": 910 + }, + { + "epoch": 0.792505653063422, + "grad_norm": 1.9375, + "learning_rate": 1.8162000000000003e-05, + "loss": 0.3581, + "step": 920 + }, + { + "epoch": 0.8011198449445461, + "grad_norm": 2.171875, + "learning_rate": 1.8142000000000004e-05, + "loss": 0.3534, + "step": 930 + }, + { + "epoch": 0.8097340368256702, + "grad_norm": 2.328125, + "learning_rate": 1.8122e-05, + "loss": 0.3654, + "step": 940 + }, + { + "epoch": 0.8183482287067945, + "grad_norm": 1.875, + "learning_rate": 1.8102e-05, + "loss": 0.3663, + "step": 950 + }, + { + "epoch": 0.8269624205879186, + "grad_norm": 1.921875, + "learning_rate": 1.8082e-05, + "loss": 0.3599, + "step": 960 + }, + { + "epoch": 0.8355766124690428, + "grad_norm": 2.296875, + "learning_rate": 1.8062e-05, + "loss": 0.3511, + "step": 970 + }, + { + "epoch": 0.8441908043501669, + "grad_norm": 2.0625, + "learning_rate": 1.8042e-05, + "loss": 0.3615, + "step": 980 + }, + { + "epoch": 0.852804996231291, + "grad_norm": 1.9140625, + "learning_rate": 1.8022e-05, + "loss": 0.3523, + "step": 990 + }, + { + "epoch": 0.8614191881124152, + "grad_norm": 2.0, + "learning_rate": 1.8002e-05, + "loss": 0.3591, + "step": 1000 + }, + { + "epoch": 0.8700333799935394, + "grad_norm": 2.078125, + "learning_rate": 1.7982e-05, + "loss": 0.3567, + "step": 1010 + }, + { + "epoch": 0.8786475718746635, + "grad_norm": 1.984375, + "learning_rate": 1.7962000000000002e-05, + "loss": 0.3568, + "step": 1020 + }, + { + "epoch": 0.8872617637557877, + "grad_norm": 1.765625, + "learning_rate": 1.7942000000000002e-05, + "loss": 0.3492, + "step": 1030 + }, + { + "epoch": 0.8958759556369118, + "grad_norm": 1.671875, + "learning_rate": 1.7922000000000002e-05, + "loss": 0.3386, + "step": 1040 + }, + { + "epoch": 0.9044901475180359, + "grad_norm": 2.09375, + "learning_rate": 1.7902000000000002e-05, + "loss": 0.3496, + "step": 1050 + }, + { + "epoch": 0.9131043393991601, + "grad_norm": 1.9296875, + "learning_rate": 1.7882000000000003e-05, + "loss": 0.3278, + "step": 1060 + }, + { + "epoch": 0.9217185312802842, + "grad_norm": 1.8359375, + "learning_rate": 1.7862000000000003e-05, + "loss": 0.3343, + "step": 1070 + }, + { + "epoch": 0.9303327231614085, + "grad_norm": 1.6171875, + "learning_rate": 1.7842000000000003e-05, + "loss": 0.3389, + "step": 1080 + }, + { + "epoch": 0.9389469150425326, + "grad_norm": 1.96875, + "learning_rate": 1.7822000000000003e-05, + "loss": 0.351, + "step": 1090 + }, + { + "epoch": 0.9475611069236567, + "grad_norm": 1.8203125, + "learning_rate": 1.7802e-05, + "loss": 0.3625, + "step": 1100 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 2.03125, + "learning_rate": 1.7782e-05, + "loss": 0.3597, + "step": 1110 + }, + { + "epoch": 0.964789490685905, + "grad_norm": 2.015625, + "learning_rate": 1.7762e-05, + "loss": 0.3631, + "step": 1120 + }, + { + "epoch": 0.9734036825670291, + "grad_norm": 1.859375, + "learning_rate": 1.7742e-05, + "loss": 0.3378, + "step": 1130 + }, + { + "epoch": 0.9820178744481534, + "grad_norm": 1.71875, + "learning_rate": 1.7722e-05, + "loss": 0.3461, + "step": 1140 + }, + { + "epoch": 0.9906320663292775, + "grad_norm": 1.9609375, + "learning_rate": 1.7702e-05, + "loss": 0.3691, + "step": 1150 + }, + { + "epoch": 0.9992462582104016, + "grad_norm": 1.6328125, + "learning_rate": 1.7682e-05, + "loss": 0.3332, + "step": 1160 + }, + { + "epoch": 1.0077527726930118, + "grad_norm": 1.7890625, + "learning_rate": 1.7662e-05, + "loss": 0.2853, + "step": 1170 + }, + { + "epoch": 1.0163669645741358, + "grad_norm": 1.8125, + "learning_rate": 1.7642e-05, + "loss": 0.284, + "step": 1180 + }, + { + "epoch": 1.02498115645526, + "grad_norm": 1.78125, + "learning_rate": 1.7622000000000002e-05, + "loss": 0.2768, + "step": 1190 + }, + { + "epoch": 1.0335953483363842, + "grad_norm": 1.8515625, + "learning_rate": 1.7602000000000002e-05, + "loss": 0.2734, + "step": 1200 + }, + { + "epoch": 1.0422095402175084, + "grad_norm": 1.7734375, + "learning_rate": 1.7582000000000002e-05, + "loss": 0.2697, + "step": 1210 + }, + { + "epoch": 1.0508237320986324, + "grad_norm": 1.9765625, + "learning_rate": 1.7562000000000002e-05, + "loss": 0.2799, + "step": 1220 + }, + { + "epoch": 1.0594379239797567, + "grad_norm": 1.875, + "learning_rate": 1.7542000000000002e-05, + "loss": 0.2766, + "step": 1230 + }, + { + "epoch": 1.068052115860881, + "grad_norm": 1.96875, + "learning_rate": 1.7522000000000003e-05, + "loss": 0.2742, + "step": 1240 + }, + { + "epoch": 1.076666307742005, + "grad_norm": 1.8671875, + "learning_rate": 1.7502000000000003e-05, + "loss": 0.2882, + "step": 1250 + }, + { + "epoch": 1.0852804996231291, + "grad_norm": 1.8046875, + "learning_rate": 1.7482e-05, + "loss": 0.2882, + "step": 1260 + }, + { + "epoch": 1.0938946915042533, + "grad_norm": 2.140625, + "learning_rate": 1.7462e-05, + "loss": 0.2828, + "step": 1270 + }, + { + "epoch": 1.1025088833853773, + "grad_norm": 1.859375, + "learning_rate": 1.7442e-05, + "loss": 0.2732, + "step": 1280 + }, + { + "epoch": 1.1111230752665016, + "grad_norm": 1.65625, + "learning_rate": 1.7422e-05, + "loss": 0.2798, + "step": 1290 + }, + { + "epoch": 1.1197372671476258, + "grad_norm": 1.8671875, + "learning_rate": 1.7402e-05, + "loss": 0.293, + "step": 1300 + }, + { + "epoch": 1.1283514590287498, + "grad_norm": 1.9609375, + "learning_rate": 1.7382e-05, + "loss": 0.2971, + "step": 1310 + }, + { + "epoch": 1.136965650909874, + "grad_norm": 1.953125, + "learning_rate": 1.7362e-05, + "loss": 0.2673, + "step": 1320 + }, + { + "epoch": 1.1455798427909982, + "grad_norm": 1.75, + "learning_rate": 1.7342e-05, + "loss": 0.2875, + "step": 1330 + }, + { + "epoch": 1.1541940346721222, + "grad_norm": 1.6640625, + "learning_rate": 1.7322e-05, + "loss": 0.2723, + "step": 1340 + }, + { + "epoch": 1.1628082265532464, + "grad_norm": 1.9296875, + "learning_rate": 1.7302e-05, + "loss": 0.3001, + "step": 1350 + }, + { + "epoch": 1.1714224184343707, + "grad_norm": 1.6875, + "learning_rate": 1.7282e-05, + "loss": 0.2878, + "step": 1360 + }, + { + "epoch": 1.180036610315495, + "grad_norm": 1.5546875, + "learning_rate": 1.7262000000000002e-05, + "loss": 0.2663, + "step": 1370 + }, + { + "epoch": 1.188650802196619, + "grad_norm": 1.90625, + "learning_rate": 1.7242000000000002e-05, + "loss": 0.3061, + "step": 1380 + }, + { + "epoch": 1.1972649940777431, + "grad_norm": 1.84375, + "learning_rate": 1.7222000000000002e-05, + "loss": 0.3015, + "step": 1390 + }, + { + "epoch": 1.2058791859588673, + "grad_norm": 1.8125, + "learning_rate": 1.7202000000000002e-05, + "loss": 0.2825, + "step": 1400 + }, + { + "epoch": 1.2144933778399913, + "grad_norm": 1.859375, + "learning_rate": 1.7182000000000003e-05, + "loss": 0.2818, + "step": 1410 + }, + { + "epoch": 1.2231075697211156, + "grad_norm": 1.96875, + "learning_rate": 1.7162e-05, + "loss": 0.2755, + "step": 1420 + }, + { + "epoch": 1.2317217616022398, + "grad_norm": 1.7421875, + "learning_rate": 1.7142e-05, + "loss": 0.2897, + "step": 1430 + }, + { + "epoch": 1.2403359534833638, + "grad_norm": 1.8515625, + "learning_rate": 1.7122e-05, + "loss": 0.2816, + "step": 1440 + }, + { + "epoch": 1.248950145364488, + "grad_norm": 2.03125, + "learning_rate": 1.7102e-05, + "loss": 0.2798, + "step": 1450 + }, + { + "epoch": 1.2575643372456122, + "grad_norm": 2.0, + "learning_rate": 1.7082e-05, + "loss": 0.2799, + "step": 1460 + }, + { + "epoch": 1.2661785291267362, + "grad_norm": 1.6953125, + "learning_rate": 1.7062e-05, + "loss": 0.2843, + "step": 1470 + }, + { + "epoch": 1.2747927210078605, + "grad_norm": 1.953125, + "learning_rate": 1.7042e-05, + "loss": 0.2876, + "step": 1480 + }, + { + "epoch": 1.2834069128889847, + "grad_norm": 1.8203125, + "learning_rate": 1.7022e-05, + "loss": 0.2912, + "step": 1490 + }, + { + "epoch": 1.2920211047701087, + "grad_norm": 2.0, + "learning_rate": 1.7002e-05, + "loss": 0.2811, + "step": 1500 + }, + { + "epoch": 1.300635296651233, + "grad_norm": 1.875, + "learning_rate": 1.6982e-05, + "loss": 0.2944, + "step": 1510 + }, + { + "epoch": 1.3092494885323571, + "grad_norm": 1.828125, + "learning_rate": 1.6962e-05, + "loss": 0.2796, + "step": 1520 + }, + { + "epoch": 1.3178636804134811, + "grad_norm": 1.78125, + "learning_rate": 1.6942e-05, + "loss": 0.2857, + "step": 1530 + }, + { + "epoch": 1.3264778722946053, + "grad_norm": 1.8359375, + "learning_rate": 1.6922e-05, + "loss": 0.278, + "step": 1540 + }, + { + "epoch": 1.3350920641757296, + "grad_norm": 2.125, + "learning_rate": 1.6902000000000002e-05, + "loss": 0.2711, + "step": 1550 + }, + { + "epoch": 1.3437062560568536, + "grad_norm": 1.859375, + "learning_rate": 1.6882000000000002e-05, + "loss": 0.2894, + "step": 1560 + }, + { + "epoch": 1.3523204479379778, + "grad_norm": 1.71875, + "learning_rate": 1.6862000000000002e-05, + "loss": 0.2834, + "step": 1570 + }, + { + "epoch": 1.360934639819102, + "grad_norm": 1.90625, + "learning_rate": 1.6842e-05, + "loss": 0.2748, + "step": 1580 + }, + { + "epoch": 1.369548831700226, + "grad_norm": 1.734375, + "learning_rate": 1.6822e-05, + "loss": 0.2904, + "step": 1590 + }, + { + "epoch": 1.3781630235813502, + "grad_norm": 1.65625, + "learning_rate": 1.6802e-05, + "loss": 0.2816, + "step": 1600 + }, + { + "epoch": 1.3867772154624745, + "grad_norm": 1.90625, + "learning_rate": 1.6782e-05, + "loss": 0.2993, + "step": 1610 + }, + { + "epoch": 1.3953914073435985, + "grad_norm": 1.96875, + "learning_rate": 1.6762e-05, + "loss": 0.2747, + "step": 1620 + }, + { + "epoch": 1.4040055992247227, + "grad_norm": 1.90625, + "learning_rate": 1.6742e-05, + "loss": 0.2814, + "step": 1630 + }, + { + "epoch": 1.412619791105847, + "grad_norm": 1.90625, + "learning_rate": 1.6722e-05, + "loss": 0.2777, + "step": 1640 + }, + { + "epoch": 1.421233982986971, + "grad_norm": 2.125, + "learning_rate": 1.6702e-05, + "loss": 0.2789, + "step": 1650 + }, + { + "epoch": 1.4298481748680951, + "grad_norm": 1.703125, + "learning_rate": 1.6682e-05, + "loss": 0.2876, + "step": 1660 + }, + { + "epoch": 1.4384623667492193, + "grad_norm": 1.828125, + "learning_rate": 1.6662e-05, + "loss": 0.2855, + "step": 1670 + }, + { + "epoch": 1.4470765586303436, + "grad_norm": 1.6953125, + "learning_rate": 1.6642e-05, + "loss": 0.2823, + "step": 1680 + }, + { + "epoch": 1.4556907505114676, + "grad_norm": 1.6875, + "learning_rate": 1.6622e-05, + "loss": 0.2759, + "step": 1690 + }, + { + "epoch": 1.4643049423925918, + "grad_norm": 1.734375, + "learning_rate": 1.6602e-05, + "loss": 0.2804, + "step": 1700 + }, + { + "epoch": 1.472919134273716, + "grad_norm": 1.71875, + "learning_rate": 1.6582e-05, + "loss": 0.2772, + "step": 1710 + }, + { + "epoch": 1.4815333261548402, + "grad_norm": 2.078125, + "learning_rate": 1.6562e-05, + "loss": 0.2834, + "step": 1720 + }, + { + "epoch": 1.4901475180359642, + "grad_norm": 1.765625, + "learning_rate": 1.6542000000000002e-05, + "loss": 0.2788, + "step": 1730 + }, + { + "epoch": 1.4987617099170885, + "grad_norm": 1.8359375, + "learning_rate": 1.6522e-05, + "loss": 0.2796, + "step": 1740 + }, + { + "epoch": 1.5073759017982127, + "grad_norm": 1.7578125, + "learning_rate": 1.6502e-05, + "loss": 0.2764, + "step": 1750 + }, + { + "epoch": 1.5159900936793367, + "grad_norm": 1.859375, + "learning_rate": 1.6482000000000002e-05, + "loss": 0.2893, + "step": 1760 + }, + { + "epoch": 1.524604285560461, + "grad_norm": 1.9140625, + "learning_rate": 1.6462000000000003e-05, + "loss": 0.2868, + "step": 1770 + }, + { + "epoch": 1.5332184774415851, + "grad_norm": 2.046875, + "learning_rate": 1.6442000000000003e-05, + "loss": 0.2801, + "step": 1780 + }, + { + "epoch": 1.5418326693227091, + "grad_norm": 1.78125, + "learning_rate": 1.6422000000000003e-05, + "loss": 0.2845, + "step": 1790 + }, + { + "epoch": 1.5504468612038333, + "grad_norm": 1.703125, + "learning_rate": 1.6402000000000003e-05, + "loss": 0.2814, + "step": 1800 + }, + { + "epoch": 1.5590610530849576, + "grad_norm": 1.765625, + "learning_rate": 1.6382000000000003e-05, + "loss": 0.2707, + "step": 1810 + }, + { + "epoch": 1.5676752449660816, + "grad_norm": 1.8046875, + "learning_rate": 1.6362000000000004e-05, + "loss": 0.2809, + "step": 1820 + }, + { + "epoch": 1.5762894368472058, + "grad_norm": 1.7578125, + "learning_rate": 1.6342000000000004e-05, + "loss": 0.2875, + "step": 1830 + }, + { + "epoch": 1.58490362872833, + "grad_norm": 1.734375, + "learning_rate": 1.6322e-05, + "loss": 0.292, + "step": 1840 + }, + { + "epoch": 1.593517820609454, + "grad_norm": 1.71875, + "learning_rate": 1.6302e-05, + "loss": 0.2954, + "step": 1850 + }, + { + "epoch": 1.6021320124905782, + "grad_norm": 1.796875, + "learning_rate": 1.6282e-05, + "loss": 0.2823, + "step": 1860 + }, + { + "epoch": 1.6107462043717025, + "grad_norm": 1.6796875, + "learning_rate": 1.6262e-05, + "loss": 0.2822, + "step": 1870 + }, + { + "epoch": 1.6193603962528265, + "grad_norm": 1.8515625, + "learning_rate": 1.6242e-05, + "loss": 0.2776, + "step": 1880 + }, + { + "epoch": 1.6279745881339507, + "grad_norm": 1.7890625, + "learning_rate": 1.6222e-05, + "loss": 0.2798, + "step": 1890 + }, + { + "epoch": 1.636588780015075, + "grad_norm": 2.0625, + "learning_rate": 1.6202000000000002e-05, + "loss": 0.2873, + "step": 1900 + }, + { + "epoch": 1.645202971896199, + "grad_norm": 1.8203125, + "learning_rate": 1.6182000000000002e-05, + "loss": 0.2783, + "step": 1910 + }, + { + "epoch": 1.6538171637773231, + "grad_norm": 1.796875, + "learning_rate": 1.6162000000000002e-05, + "loss": 0.2847, + "step": 1920 + }, + { + "epoch": 1.6624313556584474, + "grad_norm": 2.046875, + "learning_rate": 1.6142000000000002e-05, + "loss": 0.2917, + "step": 1930 + }, + { + "epoch": 1.6710455475395714, + "grad_norm": 1.796875, + "learning_rate": 1.6122000000000003e-05, + "loss": 0.2682, + "step": 1940 + }, + { + "epoch": 1.6796597394206956, + "grad_norm": 2.109375, + "learning_rate": 1.6102000000000003e-05, + "loss": 0.2837, + "step": 1950 + }, + { + "epoch": 1.6882739313018198, + "grad_norm": 1.78125, + "learning_rate": 1.6082000000000003e-05, + "loss": 0.2852, + "step": 1960 + }, + { + "epoch": 1.6968881231829438, + "grad_norm": 1.8515625, + "learning_rate": 1.6062000000000003e-05, + "loss": 0.2896, + "step": 1970 + }, + { + "epoch": 1.705502315064068, + "grad_norm": 1.9765625, + "learning_rate": 1.6042000000000003e-05, + "loss": 0.2827, + "step": 1980 + }, + { + "epoch": 1.7141165069451922, + "grad_norm": 1.75, + "learning_rate": 1.6022000000000003e-05, + "loss": 0.2725, + "step": 1990 + }, + { + "epoch": 1.7227306988263162, + "grad_norm": 1.8828125, + "learning_rate": 1.6002000000000004e-05, + "loss": 0.2835, + "step": 2000 + }, + { + "epoch": 1.7313448907074405, + "grad_norm": 1.859375, + "learning_rate": 1.5982e-05, + "loss": 0.2779, + "step": 2010 + }, + { + "epoch": 1.7399590825885647, + "grad_norm": 1.8046875, + "learning_rate": 1.5962e-05, + "loss": 0.2856, + "step": 2020 + }, + { + "epoch": 1.7485732744696887, + "grad_norm": 1.9453125, + "learning_rate": 1.5942e-05, + "loss": 0.2835, + "step": 2030 + }, + { + "epoch": 1.757187466350813, + "grad_norm": 1.7734375, + "learning_rate": 1.5922e-05, + "loss": 0.2777, + "step": 2040 + }, + { + "epoch": 1.7658016582319371, + "grad_norm": 1.8671875, + "learning_rate": 1.5902e-05, + "loss": 0.2787, + "step": 2050 + }, + { + "epoch": 1.7744158501130611, + "grad_norm": 1.7890625, + "learning_rate": 1.5882e-05, + "loss": 0.2842, + "step": 2060 + }, + { + "epoch": 1.7830300419941856, + "grad_norm": 1.75, + "learning_rate": 1.5862e-05, + "loss": 0.2654, + "step": 2070 + }, + { + "epoch": 1.7916442338753096, + "grad_norm": 1.875, + "learning_rate": 1.5842000000000002e-05, + "loss": 0.2701, + "step": 2080 + }, + { + "epoch": 1.8002584257564336, + "grad_norm": 1.8203125, + "learning_rate": 1.5822000000000002e-05, + "loss": 0.2709, + "step": 2090 + }, + { + "epoch": 1.808872617637558, + "grad_norm": 1.8046875, + "learning_rate": 1.5802000000000002e-05, + "loss": 0.2781, + "step": 2100 + }, + { + "epoch": 1.817486809518682, + "grad_norm": 1.875, + "learning_rate": 1.5782000000000002e-05, + "loss": 0.2823, + "step": 2110 + }, + { + "epoch": 1.826101001399806, + "grad_norm": 1.875, + "learning_rate": 1.5762000000000003e-05, + "loss": 0.2747, + "step": 2120 + }, + { + "epoch": 1.8347151932809305, + "grad_norm": 1.8828125, + "learning_rate": 1.5742000000000003e-05, + "loss": 0.2731, + "step": 2130 + }, + { + "epoch": 1.8433293851620545, + "grad_norm": 1.71875, + "learning_rate": 1.5722000000000003e-05, + "loss": 0.2738, + "step": 2140 + }, + { + "epoch": 1.8519435770431787, + "grad_norm": 1.8671875, + "learning_rate": 1.5702000000000003e-05, + "loss": 0.2887, + "step": 2150 + }, + { + "epoch": 1.860557768924303, + "grad_norm": 2.046875, + "learning_rate": 1.5682000000000003e-05, + "loss": 0.2841, + "step": 2160 + }, + { + "epoch": 1.869171960805427, + "grad_norm": 2.015625, + "learning_rate": 1.5662e-05, + "loss": 0.2916, + "step": 2170 + }, + { + "epoch": 1.8777861526865511, + "grad_norm": 1.7734375, + "learning_rate": 1.5642e-05, + "loss": 0.2747, + "step": 2180 + }, + { + "epoch": 1.8864003445676754, + "grad_norm": 1.7109375, + "learning_rate": 1.5622e-05, + "loss": 0.2769, + "step": 2190 + }, + { + "epoch": 1.8950145364487994, + "grad_norm": 1.7421875, + "learning_rate": 1.5602e-05, + "loss": 0.2798, + "step": 2200 + }, + { + "epoch": 1.9036287283299236, + "grad_norm": 1.8125, + "learning_rate": 1.5582e-05, + "loss": 0.2791, + "step": 2210 + }, + { + "epoch": 1.9122429202110478, + "grad_norm": 1.9296875, + "learning_rate": 1.5562e-05, + "loss": 0.2721, + "step": 2220 + }, + { + "epoch": 1.9208571120921718, + "grad_norm": 2.015625, + "learning_rate": 1.5542e-05, + "loss": 0.2778, + "step": 2230 + }, + { + "epoch": 1.929471303973296, + "grad_norm": 1.7265625, + "learning_rate": 1.5522e-05, + "loss": 0.2792, + "step": 2240 + }, + { + "epoch": 1.9380854958544202, + "grad_norm": 1.671875, + "learning_rate": 1.5502e-05, + "loss": 0.2819, + "step": 2250 + }, + { + "epoch": 1.9466996877355442, + "grad_norm": 1.8515625, + "learning_rate": 1.5482000000000002e-05, + "loss": 0.2691, + "step": 2260 + }, + { + "epoch": 1.9553138796166685, + "grad_norm": 1.6015625, + "learning_rate": 1.5462000000000002e-05, + "loss": 0.2731, + "step": 2270 + }, + { + "epoch": 1.9639280714977927, + "grad_norm": 1.734375, + "learning_rate": 1.5442000000000002e-05, + "loss": 0.2709, + "step": 2280 + }, + { + "epoch": 1.9725422633789167, + "grad_norm": 2.0625, + "learning_rate": 1.5422000000000002e-05, + "loss": 0.288, + "step": 2290 + }, + { + "epoch": 1.981156455260041, + "grad_norm": 1.8046875, + "learning_rate": 1.5402000000000003e-05, + "loss": 0.2807, + "step": 2300 + }, + { + "epoch": 1.9897706471411651, + "grad_norm": 1.8671875, + "learning_rate": 1.5382000000000003e-05, + "loss": 0.2769, + "step": 2310 + }, + { + "epoch": 1.9983848390222891, + "grad_norm": 1.90625, + "learning_rate": 1.5362000000000003e-05, + "loss": 0.2855, + "step": 2320 + }, + { + "epoch": 2.006891353504899, + "grad_norm": 1.875, + "learning_rate": 1.5342e-05, + "loss": 0.2429, + "step": 2330 + }, + { + "epoch": 2.0155055453860236, + "grad_norm": 2.0, + "learning_rate": 1.5322e-05, + "loss": 0.2166, + "step": 2340 + }, + { + "epoch": 2.0241197372671476, + "grad_norm": 1.8125, + "learning_rate": 1.5302e-05, + "loss": 0.2022, + "step": 2350 + }, + { + "epoch": 2.0327339291482716, + "grad_norm": 1.8984375, + "learning_rate": 1.5282e-05, + "loss": 0.1995, + "step": 2360 + }, + { + "epoch": 2.041348121029396, + "grad_norm": 1.8828125, + "learning_rate": 1.5262e-05, + "loss": 0.1997, + "step": 2370 + }, + { + "epoch": 2.04996231291052, + "grad_norm": 2.109375, + "learning_rate": 1.5242e-05, + "loss": 0.2031, + "step": 2380 + }, + { + "epoch": 2.0585765047916444, + "grad_norm": 1.9609375, + "learning_rate": 1.5222000000000001e-05, + "loss": 0.21, + "step": 2390 + }, + { + "epoch": 2.0671906966727684, + "grad_norm": 2.15625, + "learning_rate": 1.5202000000000001e-05, + "loss": 0.2159, + "step": 2400 + }, + { + "epoch": 2.0758048885538924, + "grad_norm": 2.03125, + "learning_rate": 1.5182000000000001e-05, + "loss": 0.1967, + "step": 2410 + }, + { + "epoch": 2.084419080435017, + "grad_norm": 2.140625, + "learning_rate": 1.5162000000000002e-05, + "loss": 0.2168, + "step": 2420 + }, + { + "epoch": 2.093033272316141, + "grad_norm": 1.921875, + "learning_rate": 1.5142000000000002e-05, + "loss": 0.2028, + "step": 2430 + }, + { + "epoch": 2.101647464197265, + "grad_norm": 2.015625, + "learning_rate": 1.5122000000000002e-05, + "loss": 0.1958, + "step": 2440 + }, + { + "epoch": 2.1102616560783893, + "grad_norm": 2.046875, + "learning_rate": 1.5102e-05, + "loss": 0.2105, + "step": 2450 + }, + { + "epoch": 2.1188758479595133, + "grad_norm": 1.953125, + "learning_rate": 1.5082e-05, + "loss": 0.2027, + "step": 2460 + }, + { + "epoch": 2.1274900398406373, + "grad_norm": 2.09375, + "learning_rate": 1.5062e-05, + "loss": 0.207, + "step": 2470 + }, + { + "epoch": 2.136104231721762, + "grad_norm": 2.0, + "learning_rate": 1.5042000000000001e-05, + "loss": 0.2125, + "step": 2480 + }, + { + "epoch": 2.144718423602886, + "grad_norm": 2.078125, + "learning_rate": 1.5022000000000001e-05, + "loss": 0.2052, + "step": 2490 + }, + { + "epoch": 2.15333261548401, + "grad_norm": 2.15625, + "learning_rate": 1.5002000000000001e-05, + "loss": 0.2057, + "step": 2500 + }, + { + "epoch": 2.1619468073651342, + "grad_norm": 2.109375, + "learning_rate": 1.4982000000000002e-05, + "loss": 0.2159, + "step": 2510 + }, + { + "epoch": 2.1705609992462582, + "grad_norm": 1.9453125, + "learning_rate": 1.4962000000000002e-05, + "loss": 0.2088, + "step": 2520 + }, + { + "epoch": 2.1791751911273822, + "grad_norm": 2.125, + "learning_rate": 1.4942e-05, + "loss": 0.2128, + "step": 2530 + }, + { + "epoch": 2.1877893830085067, + "grad_norm": 1.984375, + "learning_rate": 1.4922e-05, + "loss": 0.2143, + "step": 2540 + }, + { + "epoch": 2.1964035748896307, + "grad_norm": 2.109375, + "learning_rate": 1.4902e-05, + "loss": 0.2098, + "step": 2550 + }, + { + "epoch": 2.2050177667707547, + "grad_norm": 2.1875, + "learning_rate": 1.4882e-05, + "loss": 0.2027, + "step": 2560 + }, + { + "epoch": 2.213631958651879, + "grad_norm": 1.9765625, + "learning_rate": 1.4862000000000001e-05, + "loss": 0.2082, + "step": 2570 + }, + { + "epoch": 2.222246150533003, + "grad_norm": 2.03125, + "learning_rate": 1.4842000000000001e-05, + "loss": 0.2132, + "step": 2580 + }, + { + "epoch": 2.230860342414127, + "grad_norm": 2.078125, + "learning_rate": 1.4822000000000001e-05, + "loss": 0.2008, + "step": 2590 + }, + { + "epoch": 2.2394745342952516, + "grad_norm": 2.1875, + "learning_rate": 1.4802000000000002e-05, + "loss": 0.207, + "step": 2600 + }, + { + "epoch": 2.2480887261763756, + "grad_norm": 1.9765625, + "learning_rate": 1.4782e-05, + "loss": 0.2129, + "step": 2610 + }, + { + "epoch": 2.2567029180574996, + "grad_norm": 2.375, + "learning_rate": 1.4762e-05, + "loss": 0.2085, + "step": 2620 + }, + { + "epoch": 2.265317109938624, + "grad_norm": 2.140625, + "learning_rate": 1.4742e-05, + "loss": 0.216, + "step": 2630 + }, + { + "epoch": 2.273931301819748, + "grad_norm": 1.984375, + "learning_rate": 1.4722e-05, + "loss": 0.2133, + "step": 2640 + }, + { + "epoch": 2.282545493700872, + "grad_norm": 2.34375, + "learning_rate": 1.4702000000000001e-05, + "loss": 0.2032, + "step": 2650 + }, + { + "epoch": 2.2911596855819965, + "grad_norm": 2.203125, + "learning_rate": 1.4682000000000001e-05, + "loss": 0.2248, + "step": 2660 + }, + { + "epoch": 2.2997738774631205, + "grad_norm": 2.25, + "learning_rate": 1.4662000000000001e-05, + "loss": 0.2056, + "step": 2670 + }, + { + "epoch": 2.3083880693442445, + "grad_norm": 2.046875, + "learning_rate": 1.4642000000000001e-05, + "loss": 0.2115, + "step": 2680 + }, + { + "epoch": 2.317002261225369, + "grad_norm": 2.296875, + "learning_rate": 1.4622e-05, + "loss": 0.1984, + "step": 2690 + }, + { + "epoch": 2.325616453106493, + "grad_norm": 2.140625, + "learning_rate": 1.4602e-05, + "loss": 0.2122, + "step": 2700 + }, + { + "epoch": 2.334230644987617, + "grad_norm": 2.09375, + "learning_rate": 1.4582e-05, + "loss": 0.2094, + "step": 2710 + }, + { + "epoch": 2.3428448368687413, + "grad_norm": 2.109375, + "learning_rate": 1.4562e-05, + "loss": 0.2225, + "step": 2720 + }, + { + "epoch": 2.3514590287498653, + "grad_norm": 2.421875, + "learning_rate": 1.4542e-05, + "loss": 0.2131, + "step": 2730 + }, + { + "epoch": 2.36007322063099, + "grad_norm": 2.40625, + "learning_rate": 1.4522000000000001e-05, + "loss": 0.2118, + "step": 2740 + }, + { + "epoch": 2.368687412512114, + "grad_norm": 2.078125, + "learning_rate": 1.4502000000000001e-05, + "loss": 0.2148, + "step": 2750 + }, + { + "epoch": 2.377301604393238, + "grad_norm": 2.375, + "learning_rate": 1.4482000000000001e-05, + "loss": 0.214, + "step": 2760 + }, + { + "epoch": 2.385915796274362, + "grad_norm": 2.078125, + "learning_rate": 1.4462e-05, + "loss": 0.2028, + "step": 2770 + }, + { + "epoch": 2.3945299881554862, + "grad_norm": 2.25, + "learning_rate": 1.4442e-05, + "loss": 0.2073, + "step": 2780 + }, + { + "epoch": 2.4031441800366102, + "grad_norm": 1.9609375, + "learning_rate": 1.4422e-05, + "loss": 0.2145, + "step": 2790 + }, + { + "epoch": 2.4117583719177347, + "grad_norm": 2.90625, + "learning_rate": 1.4402e-05, + "loss": 0.2069, + "step": 2800 + }, + { + "epoch": 2.4203725637988587, + "grad_norm": 1.8984375, + "learning_rate": 1.4382e-05, + "loss": 0.2216, + "step": 2810 + }, + { + "epoch": 2.4289867556799827, + "grad_norm": 2.03125, + "learning_rate": 1.4362e-05, + "loss": 0.2117, + "step": 2820 + }, + { + "epoch": 2.4376009475611067, + "grad_norm": 1.9609375, + "learning_rate": 1.4342000000000001e-05, + "loss": 0.219, + "step": 2830 + }, + { + "epoch": 2.446215139442231, + "grad_norm": 2.28125, + "learning_rate": 1.4322000000000001e-05, + "loss": 0.2061, + "step": 2840 + }, + { + "epoch": 2.454829331323355, + "grad_norm": 2.03125, + "learning_rate": 1.4302e-05, + "loss": 0.2182, + "step": 2850 + }, + { + "epoch": 2.4634435232044796, + "grad_norm": 2.21875, + "learning_rate": 1.4282e-05, + "loss": 0.2135, + "step": 2860 + }, + { + "epoch": 2.4720577150856036, + "grad_norm": 2.125, + "learning_rate": 1.4262e-05, + "loss": 0.215, + "step": 2870 + }, + { + "epoch": 2.4806719069667276, + "grad_norm": 2.015625, + "learning_rate": 1.4242e-05, + "loss": 0.2105, + "step": 2880 + }, + { + "epoch": 2.489286098847852, + "grad_norm": 2.078125, + "learning_rate": 1.4222e-05, + "loss": 0.2067, + "step": 2890 + }, + { + "epoch": 2.497900290728976, + "grad_norm": 2.359375, + "learning_rate": 1.4202e-05, + "loss": 0.2145, + "step": 2900 + }, + { + "epoch": 2.5065144826101, + "grad_norm": 2.328125, + "learning_rate": 1.4182e-05, + "loss": 0.2191, + "step": 2910 + }, + { + "epoch": 2.5151286744912245, + "grad_norm": 2.375, + "learning_rate": 1.4162000000000001e-05, + "loss": 0.2068, + "step": 2920 + }, + { + "epoch": 2.5237428663723485, + "grad_norm": 2.015625, + "learning_rate": 1.4142e-05, + "loss": 0.2025, + "step": 2930 + }, + { + "epoch": 2.5323570582534725, + "grad_norm": 2.140625, + "learning_rate": 1.4122e-05, + "loss": 0.2089, + "step": 2940 + }, + { + "epoch": 2.540971250134597, + "grad_norm": 2.296875, + "learning_rate": 1.4102e-05, + "loss": 0.2167, + "step": 2950 + }, + { + "epoch": 2.549585442015721, + "grad_norm": 2.140625, + "learning_rate": 1.4082e-05, + "loss": 0.2049, + "step": 2960 + }, + { + "epoch": 2.558199633896845, + "grad_norm": 1.984375, + "learning_rate": 1.4062e-05, + "loss": 0.2034, + "step": 2970 + }, + { + "epoch": 2.5668138257779693, + "grad_norm": 1.9296875, + "learning_rate": 1.4042e-05, + "loss": 0.2095, + "step": 2980 + }, + { + "epoch": 2.5754280176590933, + "grad_norm": 2.34375, + "learning_rate": 1.4022e-05, + "loss": 0.2047, + "step": 2990 + }, + { + "epoch": 2.5840422095402173, + "grad_norm": 2.234375, + "learning_rate": 1.4002e-05, + "loss": 0.2055, + "step": 3000 + }, + { + "epoch": 2.592656401421342, + "grad_norm": 2.203125, + "learning_rate": 1.3982000000000003e-05, + "loss": 0.211, + "step": 3010 + }, + { + "epoch": 2.601270593302466, + "grad_norm": 1.90625, + "learning_rate": 1.3962000000000003e-05, + "loss": 0.2009, + "step": 3020 + }, + { + "epoch": 2.6098847851835902, + "grad_norm": 2.28125, + "learning_rate": 1.3942000000000001e-05, + "loss": 0.2173, + "step": 3030 + }, + { + "epoch": 2.6184989770647142, + "grad_norm": 2.25, + "learning_rate": 1.3922000000000002e-05, + "loss": 0.2102, + "step": 3040 + }, + { + "epoch": 2.6271131689458382, + "grad_norm": 1.9140625, + "learning_rate": 1.3902000000000002e-05, + "loss": 0.2102, + "step": 3050 + }, + { + "epoch": 2.6357273608269622, + "grad_norm": 2.34375, + "learning_rate": 1.3882000000000002e-05, + "loss": 0.213, + "step": 3060 + }, + { + "epoch": 2.6443415527080867, + "grad_norm": 2.0, + "learning_rate": 1.3862000000000002e-05, + "loss": 0.2117, + "step": 3070 + }, + { + "epoch": 2.6529557445892107, + "grad_norm": 2.15625, + "learning_rate": 1.3842000000000002e-05, + "loss": 0.2142, + "step": 3080 + }, + { + "epoch": 2.661569936470335, + "grad_norm": 2.203125, + "learning_rate": 1.3822000000000003e-05, + "loss": 0.202, + "step": 3090 + }, + { + "epoch": 2.670184128351459, + "grad_norm": 2.671875, + "learning_rate": 1.3802000000000003e-05, + "loss": 0.2133, + "step": 3100 + }, + { + "epoch": 2.678798320232583, + "grad_norm": 2.25, + "learning_rate": 1.3782000000000001e-05, + "loss": 0.2086, + "step": 3110 + }, + { + "epoch": 2.687412512113707, + "grad_norm": 1.984375, + "learning_rate": 1.3762000000000001e-05, + "loss": 0.2032, + "step": 3120 + }, + { + "epoch": 2.6960267039948316, + "grad_norm": 2.40625, + "learning_rate": 1.3742000000000002e-05, + "loss": 0.2157, + "step": 3130 + }, + { + "epoch": 2.7046408958759556, + "grad_norm": 1.90625, + "learning_rate": 1.3722000000000002e-05, + "loss": 0.2139, + "step": 3140 + }, + { + "epoch": 2.71325508775708, + "grad_norm": 2.125, + "learning_rate": 1.3702000000000002e-05, + "loss": 0.2177, + "step": 3150 + }, + { + "epoch": 2.721869279638204, + "grad_norm": 2.1875, + "learning_rate": 1.3682000000000002e-05, + "loss": 0.2082, + "step": 3160 + }, + { + "epoch": 2.730483471519328, + "grad_norm": 2.109375, + "learning_rate": 1.3662000000000002e-05, + "loss": 0.2093, + "step": 3170 + }, + { + "epoch": 2.739097663400452, + "grad_norm": 1.984375, + "learning_rate": 1.3642000000000003e-05, + "loss": 0.2054, + "step": 3180 + }, + { + "epoch": 2.7477118552815765, + "grad_norm": 2.34375, + "learning_rate": 1.3622000000000003e-05, + "loss": 0.2007, + "step": 3190 + }, + { + "epoch": 2.7563260471627005, + "grad_norm": 2.34375, + "learning_rate": 1.3602000000000001e-05, + "loss": 0.2109, + "step": 3200 + }, + { + "epoch": 2.764940239043825, + "grad_norm": 2.203125, + "learning_rate": 1.3582000000000001e-05, + "loss": 0.2106, + "step": 3210 + }, + { + "epoch": 2.773554430924949, + "grad_norm": 2.515625, + "learning_rate": 1.3562000000000002e-05, + "loss": 0.2101, + "step": 3220 + }, + { + "epoch": 2.782168622806073, + "grad_norm": 2.109375, + "learning_rate": 1.3542000000000002e-05, + "loss": 0.216, + "step": 3230 + }, + { + "epoch": 2.790782814687197, + "grad_norm": 2.109375, + "learning_rate": 1.3522000000000002e-05, + "loss": 0.2103, + "step": 3240 + }, + { + "epoch": 2.7993970065683214, + "grad_norm": 2.015625, + "learning_rate": 1.3502000000000002e-05, + "loss": 0.2083, + "step": 3250 + }, + { + "epoch": 2.8080111984494454, + "grad_norm": 2.203125, + "learning_rate": 1.3482000000000002e-05, + "loss": 0.2124, + "step": 3260 + }, + { + "epoch": 2.81662539033057, + "grad_norm": 2.546875, + "learning_rate": 1.3462000000000003e-05, + "loss": 0.2114, + "step": 3270 + }, + { + "epoch": 2.825239582211694, + "grad_norm": 2.0, + "learning_rate": 1.3442000000000001e-05, + "loss": 0.2122, + "step": 3280 + }, + { + "epoch": 2.833853774092818, + "grad_norm": 2.203125, + "learning_rate": 1.3422000000000001e-05, + "loss": 0.217, + "step": 3290 + }, + { + "epoch": 2.842467965973942, + "grad_norm": 2.34375, + "learning_rate": 1.3402000000000001e-05, + "loss": 0.2137, + "step": 3300 + }, + { + "epoch": 2.8510821578550662, + "grad_norm": 2.0, + "learning_rate": 1.3382000000000002e-05, + "loss": 0.2171, + "step": 3310 + }, + { + "epoch": 2.8596963497361902, + "grad_norm": 2.09375, + "learning_rate": 1.3362000000000002e-05, + "loss": 0.21, + "step": 3320 + }, + { + "epoch": 2.8683105416173147, + "grad_norm": 2.203125, + "learning_rate": 1.3342000000000002e-05, + "loss": 0.2083, + "step": 3330 + }, + { + "epoch": 2.8769247334984387, + "grad_norm": 2.171875, + "learning_rate": 1.3322000000000002e-05, + "loss": 0.2178, + "step": 3340 + }, + { + "epoch": 2.8855389253795627, + "grad_norm": 2.015625, + "learning_rate": 1.3302000000000002e-05, + "loss": 0.2086, + "step": 3350 + }, + { + "epoch": 2.894153117260687, + "grad_norm": 1.953125, + "learning_rate": 1.3282000000000001e-05, + "loss": 0.2144, + "step": 3360 + }, + { + "epoch": 2.902767309141811, + "grad_norm": 2.09375, + "learning_rate": 1.3262000000000001e-05, + "loss": 0.2067, + "step": 3370 + }, + { + "epoch": 2.911381501022935, + "grad_norm": 2.125, + "learning_rate": 1.3242000000000001e-05, + "loss": 0.2104, + "step": 3380 + }, + { + "epoch": 2.9199956929040596, + "grad_norm": 2.25, + "learning_rate": 1.3222000000000001e-05, + "loss": 0.2135, + "step": 3390 + }, + { + "epoch": 2.9286098847851836, + "grad_norm": 2.328125, + "learning_rate": 1.3202000000000002e-05, + "loss": 0.2095, + "step": 3400 + }, + { + "epoch": 2.9372240766663076, + "grad_norm": 2.234375, + "learning_rate": 1.3182000000000002e-05, + "loss": 0.2147, + "step": 3410 + }, + { + "epoch": 2.945838268547432, + "grad_norm": 2.484375, + "learning_rate": 1.3162000000000002e-05, + "loss": 0.2146, + "step": 3420 + }, + { + "epoch": 2.954452460428556, + "grad_norm": 2.28125, + "learning_rate": 1.3142000000000002e-05, + "loss": 0.2137, + "step": 3430 + }, + { + "epoch": 2.9630666523096805, + "grad_norm": 2.09375, + "learning_rate": 1.3122e-05, + "loss": 0.2111, + "step": 3440 + }, + { + "epoch": 2.9716808441908045, + "grad_norm": 2.15625, + "learning_rate": 1.3102000000000001e-05, + "loss": 0.2194, + "step": 3450 + }, + { + "epoch": 2.9802950360719285, + "grad_norm": 2.265625, + "learning_rate": 1.3082000000000001e-05, + "loss": 0.21, + "step": 3460 + }, + { + "epoch": 2.9889092279530525, + "grad_norm": 2.109375, + "learning_rate": 1.3062000000000001e-05, + "loss": 0.2004, + "step": 3470 + }, + { + "epoch": 2.997523419834177, + "grad_norm": 2.15625, + "learning_rate": 1.3042000000000002e-05, + "loss": 0.212, + "step": 3480 + }, + { + "epoch": 3.006029934316787, + "grad_norm": 2.328125, + "learning_rate": 1.3022000000000002e-05, + "loss": 0.1743, + "step": 3490 + }, + { + "epoch": 3.014644126197911, + "grad_norm": 2.28125, + "learning_rate": 1.3002000000000002e-05, + "loss": 0.1524, + "step": 3500 + }, + { + "epoch": 3.0232583180790353, + "grad_norm": 3.359375, + "learning_rate": 1.2982000000000002e-05, + "loss": 0.1476, + "step": 3510 + }, + { + "epoch": 3.0318725099601593, + "grad_norm": 2.359375, + "learning_rate": 1.2962e-05, + "loss": 0.1408, + "step": 3520 + }, + { + "epoch": 3.0404867018412833, + "grad_norm": 3.03125, + "learning_rate": 1.2942e-05, + "loss": 0.1495, + "step": 3530 + }, + { + "epoch": 3.049100893722408, + "grad_norm": 2.296875, + "learning_rate": 1.2922000000000001e-05, + "loss": 0.1457, + "step": 3540 + }, + { + "epoch": 3.057715085603532, + "grad_norm": 2.71875, + "learning_rate": 1.2902000000000001e-05, + "loss": 0.1483, + "step": 3550 + }, + { + "epoch": 3.066329277484656, + "grad_norm": 2.234375, + "learning_rate": 1.2882000000000001e-05, + "loss": 0.1451, + "step": 3560 + }, + { + "epoch": 3.0749434693657802, + "grad_norm": 2.5, + "learning_rate": 1.2862000000000002e-05, + "loss": 0.1378, + "step": 3570 + }, + { + "epoch": 3.0835576612469042, + "grad_norm": 2.53125, + "learning_rate": 1.2842000000000002e-05, + "loss": 0.1539, + "step": 3580 + }, + { + "epoch": 3.0921718531280282, + "grad_norm": 2.46875, + "learning_rate": 1.2822000000000002e-05, + "loss": 0.1503, + "step": 3590 + }, + { + "epoch": 3.1007860450091527, + "grad_norm": 2.15625, + "learning_rate": 1.2802e-05, + "loss": 0.147, + "step": 3600 + }, + { + "epoch": 3.1094002368902767, + "grad_norm": 2.53125, + "learning_rate": 1.2782e-05, + "loss": 0.1447, + "step": 3610 + }, + { + "epoch": 3.1180144287714007, + "grad_norm": 2.5625, + "learning_rate": 1.2762e-05, + "loss": 0.1568, + "step": 3620 + }, + { + "epoch": 3.126628620652525, + "grad_norm": 2.640625, + "learning_rate": 1.2742000000000001e-05, + "loss": 0.1397, + "step": 3630 + }, + { + "epoch": 3.135242812533649, + "grad_norm": 2.546875, + "learning_rate": 1.2722000000000001e-05, + "loss": 0.1347, + "step": 3640 + }, + { + "epoch": 3.143857004414773, + "grad_norm": 2.578125, + "learning_rate": 1.2702000000000001e-05, + "loss": 0.1487, + "step": 3650 + }, + { + "epoch": 3.1524711962958976, + "grad_norm": 2.296875, + "learning_rate": 1.2682000000000002e-05, + "loss": 0.1458, + "step": 3660 + }, + { + "epoch": 3.1610853881770216, + "grad_norm": 2.359375, + "learning_rate": 1.2662000000000002e-05, + "loss": 0.139, + "step": 3670 + }, + { + "epoch": 3.169699580058146, + "grad_norm": 2.46875, + "learning_rate": 1.2642e-05, + "loss": 0.141, + "step": 3680 + }, + { + "epoch": 3.17831377193927, + "grad_norm": 2.640625, + "learning_rate": 1.2622e-05, + "loss": 0.1355, + "step": 3690 + }, + { + "epoch": 3.186927963820394, + "grad_norm": 2.703125, + "learning_rate": 1.2602e-05, + "loss": 0.149, + "step": 3700 + }, + { + "epoch": 3.1955421557015184, + "grad_norm": 2.546875, + "learning_rate": 1.2582e-05, + "loss": 0.1485, + "step": 3710 + }, + { + "epoch": 3.2041563475826425, + "grad_norm": 2.4375, + "learning_rate": 1.2562000000000001e-05, + "loss": 0.1475, + "step": 3720 + }, + { + "epoch": 3.2127705394637665, + "grad_norm": 2.671875, + "learning_rate": 1.2542000000000001e-05, + "loss": 0.1469, + "step": 3730 + }, + { + "epoch": 3.221384731344891, + "grad_norm": 2.625, + "learning_rate": 1.2522000000000001e-05, + "loss": 0.1448, + "step": 3740 + }, + { + "epoch": 3.229998923226015, + "grad_norm": 2.1875, + "learning_rate": 1.2502000000000002e-05, + "loss": 0.1454, + "step": 3750 + }, + { + "epoch": 3.238613115107139, + "grad_norm": 2.234375, + "learning_rate": 1.2482e-05, + "loss": 0.1437, + "step": 3760 + }, + { + "epoch": 3.2472273069882633, + "grad_norm": 2.515625, + "learning_rate": 1.2462e-05, + "loss": 0.1521, + "step": 3770 + }, + { + "epoch": 3.2558414988693873, + "grad_norm": 2.40625, + "learning_rate": 1.2442e-05, + "loss": 0.1462, + "step": 3780 + }, + { + "epoch": 3.2644556907505113, + "grad_norm": 2.84375, + "learning_rate": 1.2422e-05, + "loss": 0.1528, + "step": 3790 + }, + { + "epoch": 3.273069882631636, + "grad_norm": 2.5, + "learning_rate": 1.2402000000000001e-05, + "loss": 0.1438, + "step": 3800 + }, + { + "epoch": 3.28168407451276, + "grad_norm": 2.71875, + "learning_rate": 1.2382000000000001e-05, + "loss": 0.1481, + "step": 3810 + }, + { + "epoch": 3.290298266393884, + "grad_norm": 2.59375, + "learning_rate": 1.2362000000000001e-05, + "loss": 0.1521, + "step": 3820 + }, + { + "epoch": 3.2989124582750082, + "grad_norm": 2.375, + "learning_rate": 1.2342000000000001e-05, + "loss": 0.1453, + "step": 3830 + }, + { + "epoch": 3.3075266501561322, + "grad_norm": 2.75, + "learning_rate": 1.2322e-05, + "loss": 0.1453, + "step": 3840 + }, + { + "epoch": 3.3161408420372562, + "grad_norm": 2.328125, + "learning_rate": 1.2302e-05, + "loss": 0.1512, + "step": 3850 + }, + { + "epoch": 3.3247550339183807, + "grad_norm": 2.484375, + "learning_rate": 1.2282e-05, + "loss": 0.1484, + "step": 3860 + }, + { + "epoch": 3.3333692257995047, + "grad_norm": 2.6875, + "learning_rate": 1.2262e-05, + "loss": 0.1374, + "step": 3870 + }, + { + "epoch": 3.3419834176806287, + "grad_norm": 2.515625, + "learning_rate": 1.2242e-05, + "loss": 0.149, + "step": 3880 + }, + { + "epoch": 3.350597609561753, + "grad_norm": 2.75, + "learning_rate": 1.2222000000000001e-05, + "loss": 0.151, + "step": 3890 + }, + { + "epoch": 3.359211801442877, + "grad_norm": 2.328125, + "learning_rate": 1.2202000000000001e-05, + "loss": 0.143, + "step": 3900 + }, + { + "epoch": 3.367825993324001, + "grad_norm": 2.6875, + "learning_rate": 1.2182000000000001e-05, + "loss": 0.1484, + "step": 3910 + }, + { + "epoch": 3.3764401852051256, + "grad_norm": 2.234375, + "learning_rate": 1.2162e-05, + "loss": 0.1532, + "step": 3920 + }, + { + "epoch": 3.3850543770862496, + "grad_norm": 2.640625, + "learning_rate": 1.2142e-05, + "loss": 0.1472, + "step": 3930 + }, + { + "epoch": 3.3936685689673736, + "grad_norm": 2.5, + "learning_rate": 1.2122e-05, + "loss": 0.1448, + "step": 3940 + }, + { + "epoch": 3.402282760848498, + "grad_norm": 2.421875, + "learning_rate": 1.2102e-05, + "loss": 0.1548, + "step": 3950 + }, + { + "epoch": 3.410896952729622, + "grad_norm": 2.734375, + "learning_rate": 1.2082e-05, + "loss": 0.1372, + "step": 3960 + }, + { + "epoch": 3.419511144610746, + "grad_norm": 2.921875, + "learning_rate": 1.2062e-05, + "loss": 0.1466, + "step": 3970 + }, + { + "epoch": 3.4281253364918705, + "grad_norm": 2.421875, + "learning_rate": 1.2042000000000001e-05, + "loss": 0.1406, + "step": 3980 + }, + { + "epoch": 3.4367395283729945, + "grad_norm": 2.765625, + "learning_rate": 1.2022000000000001e-05, + "loss": 0.1516, + "step": 3990 + }, + { + "epoch": 3.4453537202541185, + "grad_norm": 2.59375, + "learning_rate": 1.2002e-05, + "loss": 0.1491, + "step": 4000 + }, + { + "epoch": 3.453967912135243, + "grad_norm": 2.6875, + "learning_rate": 1.1982e-05, + "loss": 0.146, + "step": 4010 + }, + { + "epoch": 3.462582104016367, + "grad_norm": 2.53125, + "learning_rate": 1.1962e-05, + "loss": 0.1438, + "step": 4020 + }, + { + "epoch": 3.4711962958974913, + "grad_norm": 2.96875, + "learning_rate": 1.1942e-05, + "loss": 0.1372, + "step": 4030 + }, + { + "epoch": 3.4798104877786153, + "grad_norm": 2.359375, + "learning_rate": 1.1922e-05, + "loss": 0.1501, + "step": 4040 + }, + { + "epoch": 3.4884246796597393, + "grad_norm": 2.71875, + "learning_rate": 1.1902e-05, + "loss": 0.1515, + "step": 4050 + }, + { + "epoch": 3.4970388715408633, + "grad_norm": 2.796875, + "learning_rate": 1.1882e-05, + "loss": 0.151, + "step": 4060 + }, + { + "epoch": 3.505653063421988, + "grad_norm": 2.53125, + "learning_rate": 1.1862000000000001e-05, + "loss": 0.1468, + "step": 4070 + }, + { + "epoch": 3.514267255303112, + "grad_norm": 2.6875, + "learning_rate": 1.1842e-05, + "loss": 0.1477, + "step": 4080 + }, + { + "epoch": 3.5228814471842362, + "grad_norm": 2.375, + "learning_rate": 1.1822e-05, + "loss": 0.1516, + "step": 4090 + }, + { + "epoch": 3.5314956390653602, + "grad_norm": 2.5625, + "learning_rate": 1.1802e-05, + "loss": 0.145, + "step": 4100 + }, + { + "epoch": 3.5401098309464842, + "grad_norm": 2.671875, + "learning_rate": 1.1782e-05, + "loss": 0.1434, + "step": 4110 + }, + { + "epoch": 3.5487240228276082, + "grad_norm": 2.515625, + "learning_rate": 1.1762e-05, + "loss": 0.1424, + "step": 4120 + }, + { + "epoch": 3.5573382147087327, + "grad_norm": 2.140625, + "learning_rate": 1.1742e-05, + "loss": 0.1433, + "step": 4130 + }, + { + "epoch": 3.5659524065898567, + "grad_norm": 2.578125, + "learning_rate": 1.1722e-05, + "loss": 0.1511, + "step": 4140 + }, + { + "epoch": 3.574566598470981, + "grad_norm": 2.6875, + "learning_rate": 1.1702e-05, + "loss": 0.149, + "step": 4150 + }, + { + "epoch": 3.583180790352105, + "grad_norm": 2.6875, + "learning_rate": 1.1682e-05, + "loss": 0.151, + "step": 4160 + }, + { + "epoch": 3.591794982233229, + "grad_norm": 2.796875, + "learning_rate": 1.1662e-05, + "loss": 0.1412, + "step": 4170 + }, + { + "epoch": 3.600409174114353, + "grad_norm": 2.359375, + "learning_rate": 1.1642e-05, + "loss": 0.1461, + "step": 4180 + }, + { + "epoch": 3.6090233659954776, + "grad_norm": 2.546875, + "learning_rate": 1.1622e-05, + "loss": 0.1481, + "step": 4190 + }, + { + "epoch": 3.6176375578766016, + "grad_norm": 2.484375, + "learning_rate": 1.1602e-05, + "loss": 0.1535, + "step": 4200 + }, + { + "epoch": 3.626251749757726, + "grad_norm": 2.5, + "learning_rate": 1.1582e-05, + "loss": 0.1579, + "step": 4210 + }, + { + "epoch": 3.63486594163885, + "grad_norm": 3.078125, + "learning_rate": 1.1562e-05, + "loss": 0.1528, + "step": 4220 + }, + { + "epoch": 3.643480133519974, + "grad_norm": 2.625, + "learning_rate": 1.1542e-05, + "loss": 0.1436, + "step": 4230 + }, + { + "epoch": 3.6520943254010985, + "grad_norm": 2.78125, + "learning_rate": 1.1521999999999999e-05, + "loss": 0.1488, + "step": 4240 + }, + { + "epoch": 3.6607085172822225, + "grad_norm": 2.53125, + "learning_rate": 1.1502e-05, + "loss": 0.1404, + "step": 4250 + }, + { + "epoch": 3.6693227091633465, + "grad_norm": 2.5625, + "learning_rate": 1.1482000000000001e-05, + "loss": 0.1584, + "step": 4260 + }, + { + "epoch": 3.677936901044471, + "grad_norm": 2.65625, + "learning_rate": 1.1462000000000001e-05, + "loss": 0.1429, + "step": 4270 + }, + { + "epoch": 3.686551092925595, + "grad_norm": 2.9375, + "learning_rate": 1.1442000000000002e-05, + "loss": 0.1478, + "step": 4280 + }, + { + "epoch": 3.695165284806719, + "grad_norm": 2.28125, + "learning_rate": 1.1422000000000002e-05, + "loss": 0.1495, + "step": 4290 + }, + { + "epoch": 3.7037794766878434, + "grad_norm": 2.65625, + "learning_rate": 1.1402000000000002e-05, + "loss": 0.1487, + "step": 4300 + }, + { + "epoch": 3.7123936685689674, + "grad_norm": 2.75, + "learning_rate": 1.1382000000000002e-05, + "loss": 0.1515, + "step": 4310 + }, + { + "epoch": 3.721007860450092, + "grad_norm": 2.4375, + "learning_rate": 1.1362000000000002e-05, + "loss": 0.1524, + "step": 4320 + }, + { + "epoch": 3.729622052331216, + "grad_norm": 2.65625, + "learning_rate": 1.1342000000000003e-05, + "loss": 0.1506, + "step": 4330 + }, + { + "epoch": 3.73823624421234, + "grad_norm": 2.609375, + "learning_rate": 1.1322000000000001e-05, + "loss": 0.1448, + "step": 4340 + }, + { + "epoch": 3.746850436093464, + "grad_norm": 2.390625, + "learning_rate": 1.1302000000000001e-05, + "loss": 0.1444, + "step": 4350 + }, + { + "epoch": 3.7554646279745882, + "grad_norm": 2.203125, + "learning_rate": 1.1282000000000001e-05, + "loss": 0.1483, + "step": 4360 + }, + { + "epoch": 3.7640788198557122, + "grad_norm": 2.953125, + "learning_rate": 1.1262000000000002e-05, + "loss": 0.1524, + "step": 4370 + }, + { + "epoch": 3.7726930117368367, + "grad_norm": 2.640625, + "learning_rate": 1.1242000000000002e-05, + "loss": 0.1553, + "step": 4380 + }, + { + "epoch": 3.7813072036179607, + "grad_norm": 2.484375, + "learning_rate": 1.1222000000000002e-05, + "loss": 0.156, + "step": 4390 + }, + { + "epoch": 3.7899213954990847, + "grad_norm": 2.59375, + "learning_rate": 1.1202000000000002e-05, + "loss": 0.1543, + "step": 4400 + }, + { + "epoch": 3.7985355873802087, + "grad_norm": 2.515625, + "learning_rate": 1.1182000000000002e-05, + "loss": 0.1521, + "step": 4410 + }, + { + "epoch": 3.807149779261333, + "grad_norm": 3.046875, + "learning_rate": 1.1162000000000003e-05, + "loss": 0.1515, + "step": 4420 + }, + { + "epoch": 3.815763971142457, + "grad_norm": 2.703125, + "learning_rate": 1.1142000000000001e-05, + "loss": 0.1527, + "step": 4430 + }, + { + "epoch": 3.8243781630235816, + "grad_norm": 3.015625, + "learning_rate": 1.1122000000000001e-05, + "loss": 0.1543, + "step": 4440 + }, + { + "epoch": 3.8329923549047056, + "grad_norm": 2.546875, + "learning_rate": 1.1102000000000001e-05, + "loss": 0.1461, + "step": 4450 + }, + { + "epoch": 3.8416065467858296, + "grad_norm": 3.03125, + "learning_rate": 1.1082000000000002e-05, + "loss": 0.1448, + "step": 4460 + }, + { + "epoch": 3.8502207386669536, + "grad_norm": 2.453125, + "learning_rate": 1.1062000000000002e-05, + "loss": 0.1462, + "step": 4470 + }, + { + "epoch": 3.858834930548078, + "grad_norm": 2.59375, + "learning_rate": 1.1042000000000002e-05, + "loss": 0.146, + "step": 4480 + }, + { + "epoch": 3.867449122429202, + "grad_norm": 2.84375, + "learning_rate": 1.1022000000000002e-05, + "loss": 0.1416, + "step": 4490 + }, + { + "epoch": 3.8760633143103265, + "grad_norm": 2.359375, + "learning_rate": 1.1002000000000002e-05, + "loss": 0.156, + "step": 4500 + }, + { + "epoch": 3.8846775061914505, + "grad_norm": 3.140625, + "learning_rate": 1.0982000000000001e-05, + "loss": 0.1462, + "step": 4510 + }, + { + "epoch": 3.8932916980725745, + "grad_norm": 2.578125, + "learning_rate": 1.0962000000000001e-05, + "loss": 0.1447, + "step": 4520 + }, + { + "epoch": 3.9019058899536985, + "grad_norm": 2.78125, + "learning_rate": 1.0942000000000001e-05, + "loss": 0.1467, + "step": 4530 + }, + { + "epoch": 3.910520081834823, + "grad_norm": 2.53125, + "learning_rate": 1.0922000000000001e-05, + "loss": 0.1452, + "step": 4540 + }, + { + "epoch": 3.919134273715947, + "grad_norm": 2.609375, + "learning_rate": 1.0902000000000002e-05, + "loss": 0.1519, + "step": 4550 + }, + { + "epoch": 3.9277484655970714, + "grad_norm": 2.296875, + "learning_rate": 1.0882000000000002e-05, + "loss": 0.1485, + "step": 4560 + }, + { + "epoch": 3.9363626574781954, + "grad_norm": 2.921875, + "learning_rate": 1.0862000000000002e-05, + "loss": 0.1473, + "step": 4570 + }, + { + "epoch": 3.9449768493593194, + "grad_norm": 2.78125, + "learning_rate": 1.0842000000000002e-05, + "loss": 0.1462, + "step": 4580 + }, + { + "epoch": 3.9535910412404434, + "grad_norm": 2.5625, + "learning_rate": 1.0822e-05, + "loss": 0.1472, + "step": 4590 + }, + { + "epoch": 3.962205233121568, + "grad_norm": 2.484375, + "learning_rate": 1.0802000000000001e-05, + "loss": 0.1497, + "step": 4600 + }, + { + "epoch": 3.970819425002692, + "grad_norm": 2.421875, + "learning_rate": 1.0782000000000001e-05, + "loss": 0.1474, + "step": 4610 + }, + { + "epoch": 3.9794336168838162, + "grad_norm": 2.46875, + "learning_rate": 1.0762000000000001e-05, + "loss": 0.1477, + "step": 4620 + }, + { + "epoch": 3.9880478087649402, + "grad_norm": 2.734375, + "learning_rate": 1.0742000000000002e-05, + "loss": 0.1508, + "step": 4630 + }, + { + "epoch": 3.9966620006460643, + "grad_norm": 2.515625, + "learning_rate": 1.0722000000000002e-05, + "loss": 0.1449, + "step": 4640 + }, + { + "epoch": 4.005168515128674, + "grad_norm": 2.328125, + "learning_rate": 1.0702000000000002e-05, + "loss": 0.1182, + "step": 4650 + }, + { + "epoch": 4.013782707009798, + "grad_norm": 2.75, + "learning_rate": 1.0682000000000002e-05, + "loss": 0.1047, + "step": 4660 + }, + { + "epoch": 4.022396898890923, + "grad_norm": 3.0625, + "learning_rate": 1.0662e-05, + "loss": 0.0972, + "step": 4670 + }, + { + "epoch": 4.031011090772047, + "grad_norm": 3.109375, + "learning_rate": 1.0642e-05, + "loss": 0.099, + "step": 4680 + }, + { + "epoch": 4.039625282653171, + "grad_norm": 2.609375, + "learning_rate": 1.0622000000000001e-05, + "loss": 0.0977, + "step": 4690 + }, + { + "epoch": 4.048239474534295, + "grad_norm": 2.515625, + "learning_rate": 1.0602000000000001e-05, + "loss": 0.1003, + "step": 4700 + }, + { + "epoch": 4.056853666415419, + "grad_norm": 2.625, + "learning_rate": 1.0582000000000001e-05, + "loss": 0.0964, + "step": 4710 + }, + { + "epoch": 4.065467858296543, + "grad_norm": 2.625, + "learning_rate": 1.0562000000000002e-05, + "loss": 0.0969, + "step": 4720 + }, + { + "epoch": 4.074082050177668, + "grad_norm": 3.234375, + "learning_rate": 1.0542000000000002e-05, + "loss": 0.1009, + "step": 4730 + }, + { + "epoch": 4.082696242058792, + "grad_norm": 2.625, + "learning_rate": 1.0522000000000002e-05, + "loss": 0.1039, + "step": 4740 + }, + { + "epoch": 4.091310433939916, + "grad_norm": 2.65625, + "learning_rate": 1.0502e-05, + "loss": 0.102, + "step": 4750 + }, + { + "epoch": 4.09992462582104, + "grad_norm": 2.71875, + "learning_rate": 1.0482e-05, + "loss": 0.0998, + "step": 4760 + }, + { + "epoch": 4.108538817702164, + "grad_norm": 2.46875, + "learning_rate": 1.0462e-05, + "loss": 0.1001, + "step": 4770 + }, + { + "epoch": 4.117153009583289, + "grad_norm": 2.390625, + "learning_rate": 1.0442000000000001e-05, + "loss": 0.0929, + "step": 4780 + }, + { + "epoch": 4.125767201464413, + "grad_norm": 2.78125, + "learning_rate": 1.0422000000000001e-05, + "loss": 0.0971, + "step": 4790 + }, + { + "epoch": 4.134381393345537, + "grad_norm": 2.65625, + "learning_rate": 1.0402000000000001e-05, + "loss": 0.0997, + "step": 4800 + }, + { + "epoch": 4.142995585226661, + "grad_norm": 2.515625, + "learning_rate": 1.0382000000000002e-05, + "loss": 0.1048, + "step": 4810 + }, + { + "epoch": 4.151609777107785, + "grad_norm": 2.84375, + "learning_rate": 1.0362000000000002e-05, + "loss": 0.1024, + "step": 4820 + }, + { + "epoch": 4.160223968988909, + "grad_norm": 2.828125, + "learning_rate": 1.0342e-05, + "loss": 0.1015, + "step": 4830 + }, + { + "epoch": 4.168838160870034, + "grad_norm": 2.875, + "learning_rate": 1.0322e-05, + "loss": 0.1015, + "step": 4840 + }, + { + "epoch": 4.177452352751158, + "grad_norm": 3.15625, + "learning_rate": 1.0302e-05, + "loss": 0.0962, + "step": 4850 + }, + { + "epoch": 4.186066544632282, + "grad_norm": 2.84375, + "learning_rate": 1.0282e-05, + "loss": 0.1004, + "step": 4860 + }, + { + "epoch": 4.194680736513406, + "grad_norm": 2.8125, + "learning_rate": 1.0262000000000001e-05, + "loss": 0.0913, + "step": 4870 + }, + { + "epoch": 4.20329492839453, + "grad_norm": 4.0, + "learning_rate": 1.0242000000000001e-05, + "loss": 0.0974, + "step": 4880 + }, + { + "epoch": 4.211909120275654, + "grad_norm": 2.875, + "learning_rate": 1.0222000000000001e-05, + "loss": 0.1002, + "step": 4890 + }, + { + "epoch": 4.220523312156779, + "grad_norm": 2.546875, + "learning_rate": 1.0202000000000002e-05, + "loss": 0.1032, + "step": 4900 + }, + { + "epoch": 4.229137504037903, + "grad_norm": 3.0, + "learning_rate": 1.0182e-05, + "loss": 0.0937, + "step": 4910 + }, + { + "epoch": 4.237751695919027, + "grad_norm": 2.921875, + "learning_rate": 1.0162e-05, + "loss": 0.0974, + "step": 4920 + }, + { + "epoch": 4.246365887800151, + "grad_norm": 2.703125, + "learning_rate": 1.0142e-05, + "loss": 0.1001, + "step": 4930 + }, + { + "epoch": 4.254980079681275, + "grad_norm": 3.578125, + "learning_rate": 1.0122e-05, + "loss": 0.1008, + "step": 4940 + }, + { + "epoch": 4.263594271562399, + "grad_norm": 2.578125, + "learning_rate": 1.0102000000000001e-05, + "loss": 0.0994, + "step": 4950 + }, + { + "epoch": 4.272208463443524, + "grad_norm": 2.359375, + "learning_rate": 1.0082000000000001e-05, + "loss": 0.1013, + "step": 4960 + }, + { + "epoch": 4.280822655324648, + "grad_norm": 3.4375, + "learning_rate": 1.0062000000000001e-05, + "loss": 0.0959, + "step": 4970 + }, + { + "epoch": 4.289436847205772, + "grad_norm": 3.59375, + "learning_rate": 1.0042000000000001e-05, + "loss": 0.0992, + "step": 4980 + }, + { + "epoch": 4.298051039086896, + "grad_norm": 2.734375, + "learning_rate": 1.0022e-05, + "loss": 0.0962, + "step": 4990 + }, + { + "epoch": 4.30666523096802, + "grad_norm": 2.890625, + "learning_rate": 1.0002e-05, + "loss": 0.1062, + "step": 5000 + }, + { + "epoch": 4.315279422849144, + "grad_norm": 2.71875, + "learning_rate": 9.982e-06, + "loss": 0.0995, + "step": 5010 + }, + { + "epoch": 4.3238936147302685, + "grad_norm": 2.828125, + "learning_rate": 9.962e-06, + "loss": 0.1022, + "step": 5020 + }, + { + "epoch": 4.3325078066113925, + "grad_norm": 2.6875, + "learning_rate": 9.942e-06, + "loss": 0.0996, + "step": 5030 + }, + { + "epoch": 4.3411219984925165, + "grad_norm": 3.09375, + "learning_rate": 9.922000000000001e-06, + "loss": 0.0993, + "step": 5040 + }, + { + "epoch": 4.3497361903736405, + "grad_norm": 2.84375, + "learning_rate": 9.902000000000001e-06, + "loss": 0.1029, + "step": 5050 + }, + { + "epoch": 4.3583503822547645, + "grad_norm": 2.90625, + "learning_rate": 9.882000000000001e-06, + "loss": 0.1003, + "step": 5060 + }, + { + "epoch": 4.3669645741358885, + "grad_norm": 2.984375, + "learning_rate": 9.862e-06, + "loss": 0.0979, + "step": 5070 + }, + { + "epoch": 4.375578766017013, + "grad_norm": 2.734375, + "learning_rate": 9.842e-06, + "loss": 0.1003, + "step": 5080 + }, + { + "epoch": 4.384192957898137, + "grad_norm": 2.34375, + "learning_rate": 9.822e-06, + "loss": 0.0981, + "step": 5090 + }, + { + "epoch": 4.392807149779261, + "grad_norm": 2.90625, + "learning_rate": 9.802e-06, + "loss": 0.098, + "step": 5100 + }, + { + "epoch": 4.401421341660385, + "grad_norm": 3.265625, + "learning_rate": 9.782e-06, + "loss": 0.1007, + "step": 5110 + }, + { + "epoch": 4.410035533541509, + "grad_norm": 2.75, + "learning_rate": 9.762e-06, + "loss": 0.0968, + "step": 5120 + }, + { + "epoch": 4.418649725422634, + "grad_norm": 2.75, + "learning_rate": 9.742000000000001e-06, + "loss": 0.1031, + "step": 5130 + }, + { + "epoch": 4.427263917303758, + "grad_norm": 3.078125, + "learning_rate": 9.722000000000001e-06, + "loss": 0.0996, + "step": 5140 + }, + { + "epoch": 4.435878109184882, + "grad_norm": 2.828125, + "learning_rate": 9.702e-06, + "loss": 0.0982, + "step": 5150 + }, + { + "epoch": 4.444492301066006, + "grad_norm": 2.609375, + "learning_rate": 9.682e-06, + "loss": 0.0991, + "step": 5160 + }, + { + "epoch": 4.45310649294713, + "grad_norm": 3.296875, + "learning_rate": 9.662e-06, + "loss": 0.0999, + "step": 5170 + }, + { + "epoch": 4.461720684828254, + "grad_norm": 3.109375, + "learning_rate": 9.642e-06, + "loss": 0.1042, + "step": 5180 + }, + { + "epoch": 4.470334876709378, + "grad_norm": 2.765625, + "learning_rate": 9.622000000000002e-06, + "loss": 0.1016, + "step": 5190 + }, + { + "epoch": 4.478949068590503, + "grad_norm": 2.5, + "learning_rate": 9.602e-06, + "loss": 0.0992, + "step": 5200 + }, + { + "epoch": 4.487563260471627, + "grad_norm": 3.140625, + "learning_rate": 9.582e-06, + "loss": 0.0986, + "step": 5210 + }, + { + "epoch": 4.496177452352751, + "grad_norm": 2.53125, + "learning_rate": 9.562000000000001e-06, + "loss": 0.0995, + "step": 5220 + }, + { + "epoch": 4.504791644233875, + "grad_norm": 3.25, + "learning_rate": 9.542000000000001e-06, + "loss": 0.1039, + "step": 5230 + }, + { + "epoch": 4.513405836114999, + "grad_norm": 2.890625, + "learning_rate": 9.522000000000001e-06, + "loss": 0.1017, + "step": 5240 + }, + { + "epoch": 4.522020027996124, + "grad_norm": 2.671875, + "learning_rate": 9.502000000000002e-06, + "loss": 0.1018, + "step": 5250 + }, + { + "epoch": 4.530634219877248, + "grad_norm": 3.203125, + "learning_rate": 9.482000000000002e-06, + "loss": 0.1021, + "step": 5260 + }, + { + "epoch": 4.539248411758372, + "grad_norm": 2.5625, + "learning_rate": 9.462000000000002e-06, + "loss": 0.1007, + "step": 5270 + }, + { + "epoch": 4.547862603639496, + "grad_norm": 2.359375, + "learning_rate": 9.442e-06, + "loss": 0.0974, + "step": 5280 + }, + { + "epoch": 4.55647679552062, + "grad_norm": 2.5625, + "learning_rate": 9.422e-06, + "loss": 0.0994, + "step": 5290 + }, + { + "epoch": 4.565090987401744, + "grad_norm": 2.890625, + "learning_rate": 9.402e-06, + "loss": 0.1027, + "step": 5300 + }, + { + "epoch": 4.573705179282869, + "grad_norm": 3.015625, + "learning_rate": 9.382000000000001e-06, + "loss": 0.1081, + "step": 5310 + }, + { + "epoch": 4.582319371163993, + "grad_norm": 3.015625, + "learning_rate": 9.362000000000001e-06, + "loss": 0.0977, + "step": 5320 + }, + { + "epoch": 4.590933563045117, + "grad_norm": 2.75, + "learning_rate": 9.342000000000001e-06, + "loss": 0.1057, + "step": 5330 + }, + { + "epoch": 4.599547754926241, + "grad_norm": 3.0, + "learning_rate": 9.322000000000002e-06, + "loss": 0.1008, + "step": 5340 + }, + { + "epoch": 4.608161946807365, + "grad_norm": 2.9375, + "learning_rate": 9.302000000000002e-06, + "loss": 0.102, + "step": 5350 + }, + { + "epoch": 4.616776138688489, + "grad_norm": 3.015625, + "learning_rate": 9.282e-06, + "loss": 0.106, + "step": 5360 + }, + { + "epoch": 4.625390330569614, + "grad_norm": 3.015625, + "learning_rate": 9.262e-06, + "loss": 0.1025, + "step": 5370 + }, + { + "epoch": 4.634004522450738, + "grad_norm": 2.59375, + "learning_rate": 9.242e-06, + "loss": 0.0968, + "step": 5380 + }, + { + "epoch": 4.642618714331862, + "grad_norm": 2.6875, + "learning_rate": 9.222e-06, + "loss": 0.0951, + "step": 5390 + }, + { + "epoch": 4.651232906212986, + "grad_norm": 2.71875, + "learning_rate": 9.202000000000001e-06, + "loss": 0.1027, + "step": 5400 + }, + { + "epoch": 4.65984709809411, + "grad_norm": 2.515625, + "learning_rate": 9.182000000000001e-06, + "loss": 0.1014, + "step": 5410 + }, + { + "epoch": 4.668461289975234, + "grad_norm": 2.9375, + "learning_rate": 9.162000000000001e-06, + "loss": 0.1017, + "step": 5420 + }, + { + "epoch": 4.677075481856359, + "grad_norm": 3.046875, + "learning_rate": 9.142000000000002e-06, + "loss": 0.1019, + "step": 5430 + }, + { + "epoch": 4.685689673737483, + "grad_norm": 2.828125, + "learning_rate": 9.122e-06, + "loss": 0.1034, + "step": 5440 + }, + { + "epoch": 4.694303865618607, + "grad_norm": 2.671875, + "learning_rate": 9.102e-06, + "loss": 0.098, + "step": 5450 + }, + { + "epoch": 4.702918057499731, + "grad_norm": 3.203125, + "learning_rate": 9.082e-06, + "loss": 0.0971, + "step": 5460 + }, + { + "epoch": 4.711532249380855, + "grad_norm": 2.78125, + "learning_rate": 9.062e-06, + "loss": 0.0989, + "step": 5470 + }, + { + "epoch": 4.72014644126198, + "grad_norm": 2.78125, + "learning_rate": 9.042e-06, + "loss": 0.0987, + "step": 5480 + }, + { + "epoch": 4.728760633143104, + "grad_norm": 3.046875, + "learning_rate": 9.022000000000001e-06, + "loss": 0.0983, + "step": 5490 + }, + { + "epoch": 4.737374825024228, + "grad_norm": 2.890625, + "learning_rate": 9.002000000000001e-06, + "loss": 0.102, + "step": 5500 + }, + { + "epoch": 4.745989016905352, + "grad_norm": 2.71875, + "learning_rate": 8.982000000000001e-06, + "loss": 0.1022, + "step": 5510 + }, + { + "epoch": 4.754603208786476, + "grad_norm": 2.8125, + "learning_rate": 8.962e-06, + "loss": 0.0997, + "step": 5520 + }, + { + "epoch": 4.7632174006676, + "grad_norm": 2.625, + "learning_rate": 8.942e-06, + "loss": 0.1032, + "step": 5530 + }, + { + "epoch": 4.771831592548724, + "grad_norm": 2.71875, + "learning_rate": 8.922e-06, + "loss": 0.0995, + "step": 5540 + }, + { + "epoch": 4.7804457844298485, + "grad_norm": 2.90625, + "learning_rate": 8.902e-06, + "loss": 0.1053, + "step": 5550 + }, + { + "epoch": 4.7890599763109725, + "grad_norm": 2.59375, + "learning_rate": 8.882e-06, + "loss": 0.0929, + "step": 5560 + }, + { + "epoch": 4.7976741681920965, + "grad_norm": 2.734375, + "learning_rate": 8.862000000000001e-06, + "loss": 0.1015, + "step": 5570 + }, + { + "epoch": 4.8062883600732205, + "grad_norm": 2.90625, + "learning_rate": 8.842000000000001e-06, + "loss": 0.0987, + "step": 5580 + }, + { + "epoch": 4.8149025519543445, + "grad_norm": 2.859375, + "learning_rate": 8.822000000000001e-06, + "loss": 0.1039, + "step": 5590 + }, + { + "epoch": 4.823516743835469, + "grad_norm": 3.34375, + "learning_rate": 8.802e-06, + "loss": 0.102, + "step": 5600 + }, + { + "epoch": 4.832130935716593, + "grad_norm": 2.9375, + "learning_rate": 8.782e-06, + "loss": 0.1002, + "step": 5610 + }, + { + "epoch": 4.840745127597717, + "grad_norm": 2.828125, + "learning_rate": 8.762e-06, + "loss": 0.1023, + "step": 5620 + }, + { + "epoch": 4.849359319478841, + "grad_norm": 2.46875, + "learning_rate": 8.742e-06, + "loss": 0.1007, + "step": 5630 + }, + { + "epoch": 4.857973511359965, + "grad_norm": 2.890625, + "learning_rate": 8.722e-06, + "loss": 0.1036, + "step": 5640 + }, + { + "epoch": 4.866587703241089, + "grad_norm": 2.765625, + "learning_rate": 8.702e-06, + "loss": 0.1044, + "step": 5650 + }, + { + "epoch": 4.875201895122213, + "grad_norm": 2.765625, + "learning_rate": 8.682000000000001e-06, + "loss": 0.0943, + "step": 5660 + }, + { + "epoch": 4.883816087003338, + "grad_norm": 2.609375, + "learning_rate": 8.662000000000001e-06, + "loss": 0.0982, + "step": 5670 + }, + { + "epoch": 4.892430278884462, + "grad_norm": 3.125, + "learning_rate": 8.642e-06, + "loss": 0.1033, + "step": 5680 + }, + { + "epoch": 4.901044470765586, + "grad_norm": 2.3125, + "learning_rate": 8.622e-06, + "loss": 0.1034, + "step": 5690 + }, + { + "epoch": 4.90965866264671, + "grad_norm": 2.9375, + "learning_rate": 8.602e-06, + "loss": 0.0999, + "step": 5700 + }, + { + "epoch": 4.918272854527834, + "grad_norm": 3.03125, + "learning_rate": 8.582e-06, + "loss": 0.1048, + "step": 5710 + }, + { + "epoch": 4.926887046408959, + "grad_norm": 3.75, + "learning_rate": 8.562e-06, + "loss": 0.0956, + "step": 5720 + }, + { + "epoch": 4.935501238290083, + "grad_norm": 2.609375, + "learning_rate": 8.542e-06, + "loss": 0.1041, + "step": 5730 + }, + { + "epoch": 4.944115430171207, + "grad_norm": 2.6875, + "learning_rate": 8.522e-06, + "loss": 0.1029, + "step": 5740 + }, + { + "epoch": 4.952729622052331, + "grad_norm": 2.609375, + "learning_rate": 8.502000000000001e-06, + "loss": 0.1009, + "step": 5750 + }, + { + "epoch": 4.961343813933455, + "grad_norm": 2.515625, + "learning_rate": 8.482e-06, + "loss": 0.1065, + "step": 5760 + }, + { + "epoch": 4.969958005814579, + "grad_norm": 2.78125, + "learning_rate": 8.462e-06, + "loss": 0.0968, + "step": 5770 + }, + { + "epoch": 4.978572197695704, + "grad_norm": 2.96875, + "learning_rate": 8.442e-06, + "loss": 0.1061, + "step": 5780 + }, + { + "epoch": 4.987186389576828, + "grad_norm": 2.65625, + "learning_rate": 8.422e-06, + "loss": 0.0998, + "step": 5790 + }, + { + "epoch": 4.995800581457952, + "grad_norm": 2.8125, + "learning_rate": 8.402e-06, + "loss": 0.1004, + "step": 5800 + }, + { + "epoch": 5.004307095940562, + "grad_norm": 2.09375, + "learning_rate": 8.382e-06, + "loss": 0.0826, + "step": 5810 + }, + { + "epoch": 5.012921287821686, + "grad_norm": 2.9375, + "learning_rate": 8.362e-06, + "loss": 0.0723, + "step": 5820 + }, + { + "epoch": 5.02153547970281, + "grad_norm": 2.4375, + "learning_rate": 8.342e-06, + "loss": 0.0668, + "step": 5830 + }, + { + "epoch": 5.030149671583935, + "grad_norm": 2.8125, + "learning_rate": 8.322000000000001e-06, + "loss": 0.072, + "step": 5840 + }, + { + "epoch": 5.038763863465059, + "grad_norm": 2.828125, + "learning_rate": 8.302000000000001e-06, + "loss": 0.0673, + "step": 5850 + }, + { + "epoch": 5.047378055346183, + "grad_norm": 3.03125, + "learning_rate": 8.282000000000001e-06, + "loss": 0.0656, + "step": 5860 + }, + { + "epoch": 5.055992247227307, + "grad_norm": 3.21875, + "learning_rate": 8.262000000000002e-06, + "loss": 0.0721, + "step": 5870 + }, + { + "epoch": 5.064606439108431, + "grad_norm": 2.953125, + "learning_rate": 8.242000000000002e-06, + "loss": 0.0737, + "step": 5880 + }, + { + "epoch": 5.073220630989555, + "grad_norm": 2.796875, + "learning_rate": 8.222000000000002e-06, + "loss": 0.068, + "step": 5890 + }, + { + "epoch": 5.08183482287068, + "grad_norm": 2.921875, + "learning_rate": 8.202e-06, + "loss": 0.0682, + "step": 5900 + }, + { + "epoch": 5.090449014751804, + "grad_norm": 3.09375, + "learning_rate": 8.182e-06, + "loss": 0.0677, + "step": 5910 + }, + { + "epoch": 5.099063206632928, + "grad_norm": 2.71875, + "learning_rate": 8.162e-06, + "loss": 0.0686, + "step": 5920 + }, + { + "epoch": 5.107677398514052, + "grad_norm": 2.5625, + "learning_rate": 8.142000000000001e-06, + "loss": 0.0688, + "step": 5930 + }, + { + "epoch": 5.116291590395176, + "grad_norm": 2.4375, + "learning_rate": 8.122000000000001e-06, + "loss": 0.072, + "step": 5940 + }, + { + "epoch": 5.1249057822763, + "grad_norm": 2.703125, + "learning_rate": 8.102000000000001e-06, + "loss": 0.0707, + "step": 5950 + }, + { + "epoch": 5.133519974157425, + "grad_norm": 2.734375, + "learning_rate": 8.082000000000002e-06, + "loss": 0.0733, + "step": 5960 + }, + { + "epoch": 5.142134166038549, + "grad_norm": 2.515625, + "learning_rate": 8.062000000000002e-06, + "loss": 0.0726, + "step": 5970 + }, + { + "epoch": 5.150748357919673, + "grad_norm": 3.109375, + "learning_rate": 8.042e-06, + "loss": 0.069, + "step": 5980 + }, + { + "epoch": 5.159362549800797, + "grad_norm": 2.640625, + "learning_rate": 8.022e-06, + "loss": 0.0699, + "step": 5990 + }, + { + "epoch": 5.167976741681921, + "grad_norm": 3.046875, + "learning_rate": 8.002e-06, + "loss": 0.0704, + "step": 6000 + }, + { + "epoch": 5.176590933563045, + "grad_norm": 2.515625, + "learning_rate": 7.982e-06, + "loss": 0.0676, + "step": 6010 + }, + { + "epoch": 5.18520512544417, + "grad_norm": 2.859375, + "learning_rate": 7.962000000000001e-06, + "loss": 0.0674, + "step": 6020 + }, + { + "epoch": 5.193819317325294, + "grad_norm": 2.265625, + "learning_rate": 7.942000000000001e-06, + "loss": 0.0721, + "step": 6030 + }, + { + "epoch": 5.202433509206418, + "grad_norm": 3.140625, + "learning_rate": 7.922000000000001e-06, + "loss": 0.0727, + "step": 6040 + }, + { + "epoch": 5.211047701087542, + "grad_norm": 2.703125, + "learning_rate": 7.902000000000002e-06, + "loss": 0.071, + "step": 6050 + }, + { + "epoch": 5.219661892968666, + "grad_norm": 3.5, + "learning_rate": 7.882e-06, + "loss": 0.0682, + "step": 6060 + }, + { + "epoch": 5.2282760848497905, + "grad_norm": 2.703125, + "learning_rate": 7.862e-06, + "loss": 0.0708, + "step": 6070 + }, + { + "epoch": 5.2368902767309145, + "grad_norm": 3.015625, + "learning_rate": 7.842e-06, + "loss": 0.0695, + "step": 6080 + }, + { + "epoch": 5.2455044686120385, + "grad_norm": 2.421875, + "learning_rate": 7.822e-06, + "loss": 0.0731, + "step": 6090 + }, + { + "epoch": 5.2541186604931625, + "grad_norm": 3.140625, + "learning_rate": 7.802000000000001e-06, + "loss": 0.0695, + "step": 6100 + }, + { + "epoch": 5.2627328523742865, + "grad_norm": 3.046875, + "learning_rate": 7.782000000000001e-06, + "loss": 0.0765, + "step": 6110 + }, + { + "epoch": 5.2713470442554105, + "grad_norm": 2.984375, + "learning_rate": 7.762000000000001e-06, + "loss": 0.0702, + "step": 6120 + }, + { + "epoch": 5.279961236136535, + "grad_norm": 3.015625, + "learning_rate": 7.742000000000001e-06, + "loss": 0.0698, + "step": 6130 + }, + { + "epoch": 5.288575428017659, + "grad_norm": 2.75, + "learning_rate": 7.722e-06, + "loss": 0.0724, + "step": 6140 + }, + { + "epoch": 5.297189619898783, + "grad_norm": 2.46875, + "learning_rate": 7.702e-06, + "loss": 0.0708, + "step": 6150 + }, + { + "epoch": 5.305803811779907, + "grad_norm": 2.78125, + "learning_rate": 7.682e-06, + "loss": 0.0703, + "step": 6160 + }, + { + "epoch": 5.314418003661031, + "grad_norm": 2.53125, + "learning_rate": 7.662e-06, + "loss": 0.0704, + "step": 6170 + }, + { + "epoch": 5.323032195542155, + "grad_norm": 2.5625, + "learning_rate": 7.642e-06, + "loss": 0.0674, + "step": 6180 + }, + { + "epoch": 5.33164638742328, + "grad_norm": 4.5, + "learning_rate": 7.622000000000001e-06, + "loss": 0.077, + "step": 6190 + }, + { + "epoch": 5.340260579304404, + "grad_norm": 2.921875, + "learning_rate": 7.602e-06, + "loss": 0.0722, + "step": 6200 + }, + { + "epoch": 5.348874771185528, + "grad_norm": 2.5625, + "learning_rate": 7.582e-06, + "loss": 0.0728, + "step": 6210 + }, + { + "epoch": 5.357488963066652, + "grad_norm": 2.609375, + "learning_rate": 7.562000000000001e-06, + "loss": 0.0709, + "step": 6220 + }, + { + "epoch": 5.366103154947776, + "grad_norm": 3.4375, + "learning_rate": 7.542000000000001e-06, + "loss": 0.069, + "step": 6230 + }, + { + "epoch": 5.3747173468289, + "grad_norm": 3.109375, + "learning_rate": 7.522e-06, + "loss": 0.0717, + "step": 6240 + }, + { + "epoch": 5.383331538710025, + "grad_norm": 2.53125, + "learning_rate": 7.502e-06, + "loss": 0.0677, + "step": 6250 + }, + { + "epoch": 5.391945730591149, + "grad_norm": 2.96875, + "learning_rate": 7.4820000000000005e-06, + "loss": 0.0706, + "step": 6260 + }, + { + "epoch": 5.400559922472273, + "grad_norm": 3.015625, + "learning_rate": 7.462000000000001e-06, + "loss": 0.0753, + "step": 6270 + }, + { + "epoch": 5.409174114353397, + "grad_norm": 3.3125, + "learning_rate": 7.442e-06, + "loss": 0.0698, + "step": 6280 + }, + { + "epoch": 5.417788306234521, + "grad_norm": 2.953125, + "learning_rate": 7.422e-06, + "loss": 0.0733, + "step": 6290 + }, + { + "epoch": 5.426402498115645, + "grad_norm": 3.171875, + "learning_rate": 7.4020000000000005e-06, + "loss": 0.0672, + "step": 6300 + }, + { + "epoch": 5.43501668999677, + "grad_norm": 2.6875, + "learning_rate": 7.382000000000001e-06, + "loss": 0.0701, + "step": 6310 + }, + { + "epoch": 5.443630881877894, + "grad_norm": 2.765625, + "learning_rate": 7.362e-06, + "loss": 0.0677, + "step": 6320 + }, + { + "epoch": 5.452245073759018, + "grad_norm": 2.953125, + "learning_rate": 7.342e-06, + "loss": 0.0715, + "step": 6330 + }, + { + "epoch": 5.460859265640142, + "grad_norm": 3.09375, + "learning_rate": 7.322e-06, + "loss": 0.0713, + "step": 6340 + }, + { + "epoch": 5.469473457521266, + "grad_norm": 2.59375, + "learning_rate": 7.3020000000000006e-06, + "loss": 0.0697, + "step": 6350 + }, + { + "epoch": 5.47808764940239, + "grad_norm": 2.734375, + "learning_rate": 7.282e-06, + "loss": 0.0686, + "step": 6360 + }, + { + "epoch": 5.486701841283515, + "grad_norm": 2.78125, + "learning_rate": 7.262e-06, + "loss": 0.0685, + "step": 6370 + }, + { + "epoch": 5.495316033164639, + "grad_norm": 2.625, + "learning_rate": 7.242e-06, + "loss": 0.0697, + "step": 6380 + }, + { + "epoch": 5.503930225045763, + "grad_norm": 2.8125, + "learning_rate": 7.2220000000000005e-06, + "loss": 0.0684, + "step": 6390 + }, + { + "epoch": 5.512544416926887, + "grad_norm": 2.296875, + "learning_rate": 7.202e-06, + "loss": 0.0674, + "step": 6400 + }, + { + "epoch": 5.521158608808011, + "grad_norm": 3.21875, + "learning_rate": 7.182e-06, + "loss": 0.0728, + "step": 6410 + }, + { + "epoch": 5.529772800689136, + "grad_norm": 2.953125, + "learning_rate": 7.162e-06, + "loss": 0.0706, + "step": 6420 + }, + { + "epoch": 5.53838699257026, + "grad_norm": 2.6875, + "learning_rate": 7.142e-06, + "loss": 0.0725, + "step": 6430 + }, + { + "epoch": 5.547001184451384, + "grad_norm": 2.375, + "learning_rate": 7.1220000000000014e-06, + "loss": 0.07, + "step": 6440 + }, + { + "epoch": 5.555615376332508, + "grad_norm": 2.671875, + "learning_rate": 7.102000000000001e-06, + "loss": 0.0682, + "step": 6450 + }, + { + "epoch": 5.564229568213632, + "grad_norm": 3.046875, + "learning_rate": 7.082000000000001e-06, + "loss": 0.0715, + "step": 6460 + }, + { + "epoch": 5.572843760094756, + "grad_norm": 2.515625, + "learning_rate": 7.062000000000001e-06, + "loss": 0.0715, + "step": 6470 + }, + { + "epoch": 5.58145795197588, + "grad_norm": 2.859375, + "learning_rate": 7.042000000000001e-06, + "loss": 0.0702, + "step": 6480 + }, + { + "epoch": 5.590072143857005, + "grad_norm": 2.765625, + "learning_rate": 7.022000000000001e-06, + "loss": 0.0729, + "step": 6490 + }, + { + "epoch": 5.598686335738129, + "grad_norm": 2.984375, + "learning_rate": 7.002000000000001e-06, + "loss": 0.0734, + "step": 6500 + }, + { + "epoch": 5.607300527619253, + "grad_norm": 3.15625, + "learning_rate": 6.982000000000001e-06, + "loss": 0.0785, + "step": 6510 + }, + { + "epoch": 5.615914719500377, + "grad_norm": 2.875, + "learning_rate": 6.962000000000001e-06, + "loss": 0.0727, + "step": 6520 + }, + { + "epoch": 5.624528911381501, + "grad_norm": 2.890625, + "learning_rate": 6.942000000000001e-06, + "loss": 0.0703, + "step": 6530 + }, + { + "epoch": 5.633143103262626, + "grad_norm": 3.453125, + "learning_rate": 6.922000000000001e-06, + "loss": 0.0698, + "step": 6540 + }, + { + "epoch": 5.64175729514375, + "grad_norm": 2.953125, + "learning_rate": 6.902000000000001e-06, + "loss": 0.0667, + "step": 6550 + }, + { + "epoch": 5.650371487024874, + "grad_norm": 3.5625, + "learning_rate": 6.882000000000001e-06, + "loss": 0.0707, + "step": 6560 + }, + { + "epoch": 5.658985678905998, + "grad_norm": 2.84375, + "learning_rate": 6.8620000000000005e-06, + "loss": 0.0696, + "step": 6570 + }, + { + "epoch": 5.667599870787122, + "grad_norm": 2.671875, + "learning_rate": 6.842000000000001e-06, + "loss": 0.0726, + "step": 6580 + }, + { + "epoch": 5.676214062668246, + "grad_norm": 2.609375, + "learning_rate": 6.822000000000001e-06, + "loss": 0.0737, + "step": 6590 + }, + { + "epoch": 5.6848282545493705, + "grad_norm": 2.875, + "learning_rate": 6.802000000000001e-06, + "loss": 0.0752, + "step": 6600 + }, + { + "epoch": 5.6934424464304945, + "grad_norm": 2.34375, + "learning_rate": 6.7820000000000005e-06, + "loss": 0.0711, + "step": 6610 + }, + { + "epoch": 5.7020566383116185, + "grad_norm": 3.0625, + "learning_rate": 6.762000000000001e-06, + "loss": 0.0715, + "step": 6620 + }, + { + "epoch": 5.7106708301927425, + "grad_norm": 3.140625, + "learning_rate": 6.742000000000001e-06, + "loss": 0.0715, + "step": 6630 + }, + { + "epoch": 5.7192850220738665, + "grad_norm": 2.96875, + "learning_rate": 6.722000000000001e-06, + "loss": 0.0706, + "step": 6640 + }, + { + "epoch": 5.7278992139549905, + "grad_norm": 2.875, + "learning_rate": 6.702e-06, + "loss": 0.0671, + "step": 6650 + }, + { + "epoch": 5.736513405836115, + "grad_norm": 2.984375, + "learning_rate": 6.6820000000000006e-06, + "loss": 0.0754, + "step": 6660 + }, + { + "epoch": 5.745127597717239, + "grad_norm": 2.734375, + "learning_rate": 6.662000000000001e-06, + "loss": 0.0702, + "step": 6670 + }, + { + "epoch": 5.753741789598363, + "grad_norm": 2.84375, + "learning_rate": 6.642000000000001e-06, + "loss": 0.071, + "step": 6680 + }, + { + "epoch": 5.762355981479487, + "grad_norm": 2.84375, + "learning_rate": 6.622e-06, + "loss": 0.0724, + "step": 6690 + }, + { + "epoch": 5.770970173360611, + "grad_norm": 3.21875, + "learning_rate": 6.6020000000000005e-06, + "loss": 0.0739, + "step": 6700 + }, + { + "epoch": 5.779584365241735, + "grad_norm": 3.203125, + "learning_rate": 6.582000000000001e-06, + "loss": 0.0713, + "step": 6710 + }, + { + "epoch": 5.78819855712286, + "grad_norm": 2.6875, + "learning_rate": 6.562000000000001e-06, + "loss": 0.0691, + "step": 6720 + }, + { + "epoch": 5.796812749003984, + "grad_norm": 2.75, + "learning_rate": 6.542e-06, + "loss": 0.0713, + "step": 6730 + }, + { + "epoch": 5.805426940885108, + "grad_norm": 2.6875, + "learning_rate": 6.522e-06, + "loss": 0.0685, + "step": 6740 + }, + { + "epoch": 5.814041132766232, + "grad_norm": 3.46875, + "learning_rate": 6.502000000000001e-06, + "loss": 0.0728, + "step": 6750 + }, + { + "epoch": 5.822655324647356, + "grad_norm": 3.03125, + "learning_rate": 6.482000000000001e-06, + "loss": 0.0722, + "step": 6760 + }, + { + "epoch": 5.831269516528481, + "grad_norm": 2.59375, + "learning_rate": 6.462e-06, + "loss": 0.0687, + "step": 6770 + }, + { + "epoch": 5.839883708409605, + "grad_norm": 3.0, + "learning_rate": 6.442e-06, + "loss": 0.0682, + "step": 6780 + }, + { + "epoch": 5.848497900290729, + "grad_norm": 2.40625, + "learning_rate": 6.4220000000000005e-06, + "loss": 0.0674, + "step": 6790 + }, + { + "epoch": 5.857112092171853, + "grad_norm": 2.703125, + "learning_rate": 6.402000000000001e-06, + "loss": 0.0743, + "step": 6800 + }, + { + "epoch": 5.865726284052977, + "grad_norm": 2.734375, + "learning_rate": 6.382e-06, + "loss": 0.071, + "step": 6810 + }, + { + "epoch": 5.874340475934101, + "grad_norm": 3.0625, + "learning_rate": 6.362e-06, + "loss": 0.0688, + "step": 6820 + }, + { + "epoch": 5.882954667815225, + "grad_norm": 2.734375, + "learning_rate": 6.3420000000000004e-06, + "loss": 0.0696, + "step": 6830 + }, + { + "epoch": 5.89156885969635, + "grad_norm": 2.875, + "learning_rate": 6.322000000000001e-06, + "loss": 0.0698, + "step": 6840 + }, + { + "epoch": 5.900183051577474, + "grad_norm": 2.53125, + "learning_rate": 6.302e-06, + "loss": 0.0688, + "step": 6850 + }, + { + "epoch": 5.908797243458598, + "grad_norm": 3.0, + "learning_rate": 6.282e-06, + "loss": 0.0704, + "step": 6860 + }, + { + "epoch": 5.917411435339722, + "grad_norm": 3.0, + "learning_rate": 6.262e-06, + "loss": 0.0742, + "step": 6870 + }, + { + "epoch": 5.926025627220846, + "grad_norm": 2.46875, + "learning_rate": 6.2420000000000005e-06, + "loss": 0.0656, + "step": 6880 + }, + { + "epoch": 5.934639819101971, + "grad_norm": 2.8125, + "learning_rate": 6.222e-06, + "loss": 0.072, + "step": 6890 + }, + { + "epoch": 5.943254010983095, + "grad_norm": 2.921875, + "learning_rate": 6.202e-06, + "loss": 0.0668, + "step": 6900 + }, + { + "epoch": 5.951868202864219, + "grad_norm": 2.765625, + "learning_rate": 6.182e-06, + "loss": 0.0706, + "step": 6910 + }, + { + "epoch": 5.960482394745343, + "grad_norm": 2.921875, + "learning_rate": 6.1620000000000005e-06, + "loss": 0.0685, + "step": 6920 + }, + { + "epoch": 5.969096586626467, + "grad_norm": 2.703125, + "learning_rate": 6.142e-06, + "loss": 0.074, + "step": 6930 + }, + { + "epoch": 5.977710778507591, + "grad_norm": 3.40625, + "learning_rate": 6.122e-06, + "loss": 0.0713, + "step": 6940 + }, + { + "epoch": 5.986324970388715, + "grad_norm": 2.390625, + "learning_rate": 6.102e-06, + "loss": 0.0719, + "step": 6950 + }, + { + "epoch": 5.99493916226984, + "grad_norm": 3.0, + "learning_rate": 6.082e-06, + "loss": 0.0704, + "step": 6960 + }, + { + "epoch": 6.00344567675245, + "grad_norm": 2.0, + "learning_rate": 6.062e-06, + "loss": 0.0591, + "step": 6970 + }, + { + "epoch": 6.012059868633574, + "grad_norm": 2.421875, + "learning_rate": 6.042e-06, + "loss": 0.0506, + "step": 6980 + }, + { + "epoch": 6.020674060514698, + "grad_norm": 2.859375, + "learning_rate": 6.022e-06, + "loss": 0.055, + "step": 6990 + }, + { + "epoch": 6.029288252395822, + "grad_norm": 2.28125, + "learning_rate": 6.002e-06, + "loss": 0.0564, + "step": 7000 + }, + { + "epoch": 6.037902444276947, + "grad_norm": 2.546875, + "learning_rate": 5.982e-06, + "loss": 0.0567, + "step": 7010 + }, + { + "epoch": 6.046516636158071, + "grad_norm": 2.5, + "learning_rate": 5.962e-06, + "loss": 0.056, + "step": 7020 + }, + { + "epoch": 6.055130828039195, + "grad_norm": 2.640625, + "learning_rate": 5.942e-06, + "loss": 0.0564, + "step": 7030 + }, + { + "epoch": 6.063745019920319, + "grad_norm": 2.171875, + "learning_rate": 5.922e-06, + "loss": 0.0511, + "step": 7040 + }, + { + "epoch": 6.072359211801443, + "grad_norm": 2.5625, + "learning_rate": 5.9019999999999996e-06, + "loss": 0.0535, + "step": 7050 + }, + { + "epoch": 6.080973403682567, + "grad_norm": 2.546875, + "learning_rate": 5.882e-06, + "loss": 0.0558, + "step": 7060 + }, + { + "epoch": 6.089587595563692, + "grad_norm": 2.609375, + "learning_rate": 5.862000000000001e-06, + "loss": 0.0526, + "step": 7070 + }, + { + "epoch": 6.098201787444816, + "grad_norm": 2.25, + "learning_rate": 5.842000000000001e-06, + "loss": 0.0541, + "step": 7080 + }, + { + "epoch": 6.10681597932594, + "grad_norm": 3.328125, + "learning_rate": 5.822000000000001e-06, + "loss": 0.0552, + "step": 7090 + }, + { + "epoch": 6.115430171207064, + "grad_norm": 2.453125, + "learning_rate": 5.802000000000001e-06, + "loss": 0.0551, + "step": 7100 + }, + { + "epoch": 6.124044363088188, + "grad_norm": 2.171875, + "learning_rate": 5.782000000000001e-06, + "loss": 0.0528, + "step": 7110 + }, + { + "epoch": 6.132658554969312, + "grad_norm": 2.453125, + "learning_rate": 5.762000000000001e-06, + "loss": 0.0528, + "step": 7120 + }, + { + "epoch": 6.1412727468504364, + "grad_norm": 2.125, + "learning_rate": 5.742000000000001e-06, + "loss": 0.0538, + "step": 7130 + }, + { + "epoch": 6.1498869387315604, + "grad_norm": 2.171875, + "learning_rate": 5.722000000000001e-06, + "loss": 0.0521, + "step": 7140 + }, + { + "epoch": 6.1585011306126844, + "grad_norm": 2.78125, + "learning_rate": 5.702000000000001e-06, + "loss": 0.0557, + "step": 7150 + }, + { + "epoch": 6.1671153224938084, + "grad_norm": 2.6875, + "learning_rate": 5.682000000000001e-06, + "loss": 0.0536, + "step": 7160 + }, + { + "epoch": 6.1757295143749324, + "grad_norm": 2.234375, + "learning_rate": 5.662000000000001e-06, + "loss": 0.0526, + "step": 7170 + }, + { + "epoch": 6.1843437062560564, + "grad_norm": 2.953125, + "learning_rate": 5.642000000000001e-06, + "loss": 0.0547, + "step": 7180 + }, + { + "epoch": 6.192957898137181, + "grad_norm": 2.40625, + "learning_rate": 5.6220000000000006e-06, + "loss": 0.0508, + "step": 7190 + }, + { + "epoch": 6.201572090018305, + "grad_norm": 2.453125, + "learning_rate": 5.602000000000001e-06, + "loss": 0.0541, + "step": 7200 + }, + { + "epoch": 6.210186281899429, + "grad_norm": 2.953125, + "learning_rate": 5.582000000000001e-06, + "loss": 0.0547, + "step": 7210 + }, + { + "epoch": 6.218800473780553, + "grad_norm": 2.703125, + "learning_rate": 5.562000000000001e-06, + "loss": 0.056, + "step": 7220 + }, + { + "epoch": 6.227414665661677, + "grad_norm": 2.734375, + "learning_rate": 5.5420000000000005e-06, + "loss": 0.0538, + "step": 7230 + }, + { + "epoch": 6.236028857542801, + "grad_norm": 2.1875, + "learning_rate": 5.522000000000001e-06, + "loss": 0.0491, + "step": 7240 + }, + { + "epoch": 6.244643049423926, + "grad_norm": 2.390625, + "learning_rate": 5.502000000000001e-06, + "loss": 0.0532, + "step": 7250 + }, + { + "epoch": 6.25325724130505, + "grad_norm": 2.265625, + "learning_rate": 5.482000000000001e-06, + "loss": 0.0496, + "step": 7260 + }, + { + "epoch": 6.261871433186174, + "grad_norm": 2.734375, + "learning_rate": 5.462e-06, + "loss": 0.0587, + "step": 7270 + }, + { + "epoch": 6.270485625067298, + "grad_norm": 2.546875, + "learning_rate": 5.442000000000001e-06, + "loss": 0.055, + "step": 7280 + }, + { + "epoch": 6.279099816948422, + "grad_norm": 2.4375, + "learning_rate": 5.422000000000001e-06, + "loss": 0.0559, + "step": 7290 + }, + { + "epoch": 6.287714008829546, + "grad_norm": 2.59375, + "learning_rate": 5.402000000000001e-06, + "loss": 0.0572, + "step": 7300 + }, + { + "epoch": 6.296328200710671, + "grad_norm": 2.421875, + "learning_rate": 5.382e-06, + "loss": 0.0516, + "step": 7310 + }, + { + "epoch": 6.304942392591795, + "grad_norm": 2.21875, + "learning_rate": 5.3620000000000005e-06, + "loss": 0.05, + "step": 7320 + }, + { + "epoch": 6.313556584472919, + "grad_norm": 2.734375, + "learning_rate": 5.342000000000001e-06, + "loss": 0.0534, + "step": 7330 + }, + { + "epoch": 6.322170776354043, + "grad_norm": 2.21875, + "learning_rate": 5.322000000000001e-06, + "loss": 0.056, + "step": 7340 + }, + { + "epoch": 6.330784968235167, + "grad_norm": 2.6875, + "learning_rate": 5.302e-06, + "loss": 0.0542, + "step": 7350 + }, + { + "epoch": 6.339399160116292, + "grad_norm": 2.703125, + "learning_rate": 5.282e-06, + "loss": 0.0573, + "step": 7360 + }, + { + "epoch": 6.348013351997416, + "grad_norm": 2.921875, + "learning_rate": 5.262000000000001e-06, + "loss": 0.0518, + "step": 7370 + }, + { + "epoch": 6.35662754387854, + "grad_norm": 2.5, + "learning_rate": 5.242000000000001e-06, + "loss": 0.0567, + "step": 7380 + }, + { + "epoch": 6.365241735759664, + "grad_norm": 2.859375, + "learning_rate": 5.222e-06, + "loss": 0.0563, + "step": 7390 + }, + { + "epoch": 6.373855927640788, + "grad_norm": 2.40625, + "learning_rate": 5.202e-06, + "loss": 0.0534, + "step": 7400 + }, + { + "epoch": 6.382470119521912, + "grad_norm": 2.734375, + "learning_rate": 5.1820000000000005e-06, + "loss": 0.0515, + "step": 7410 + }, + { + "epoch": 6.391084311403037, + "grad_norm": 2.71875, + "learning_rate": 5.162000000000001e-06, + "loss": 0.0568, + "step": 7420 + }, + { + "epoch": 6.399698503284161, + "grad_norm": 3.140625, + "learning_rate": 5.142e-06, + "loss": 0.0527, + "step": 7430 + }, + { + "epoch": 6.408312695165285, + "grad_norm": 2.671875, + "learning_rate": 5.122e-06, + "loss": 0.0528, + "step": 7440 + }, + { + "epoch": 6.416926887046409, + "grad_norm": 3.515625, + "learning_rate": 5.1020000000000004e-06, + "loss": 0.0553, + "step": 7450 + }, + { + "epoch": 6.425541078927533, + "grad_norm": 2.6875, + "learning_rate": 5.082000000000001e-06, + "loss": 0.0531, + "step": 7460 + }, + { + "epoch": 6.434155270808657, + "grad_norm": 2.046875, + "learning_rate": 5.062e-06, + "loss": 0.05, + "step": 7470 + }, + { + "epoch": 6.442769462689782, + "grad_norm": 3.25, + "learning_rate": 5.042e-06, + "loss": 0.0562, + "step": 7480 + }, + { + "epoch": 6.451383654570906, + "grad_norm": 2.828125, + "learning_rate": 5.022e-06, + "loss": 0.0531, + "step": 7490 + }, + { + "epoch": 6.45999784645203, + "grad_norm": 2.625, + "learning_rate": 5.0020000000000006e-06, + "loss": 0.0545, + "step": 7500 + }, + { + "epoch": 6.468612038333154, + "grad_norm": 2.4375, + "learning_rate": 4.982e-06, + "loss": 0.0528, + "step": 7510 + }, + { + "epoch": 6.477226230214278, + "grad_norm": 2.296875, + "learning_rate": 4.962e-06, + "loss": 0.0519, + "step": 7520 + }, + { + "epoch": 6.485840422095402, + "grad_norm": 2.40625, + "learning_rate": 4.942e-06, + "loss": 0.0548, + "step": 7530 + }, + { + "epoch": 6.494454613976527, + "grad_norm": 3.390625, + "learning_rate": 4.9220000000000005e-06, + "loss": 0.0534, + "step": 7540 + }, + { + "epoch": 6.503068805857651, + "grad_norm": 2.625, + "learning_rate": 4.902000000000001e-06, + "loss": 0.0516, + "step": 7550 + }, + { + "epoch": 6.511682997738775, + "grad_norm": 2.609375, + "learning_rate": 4.882000000000001e-06, + "loss": 0.055, + "step": 7560 + }, + { + "epoch": 6.520297189619899, + "grad_norm": 2.265625, + "learning_rate": 4.862e-06, + "loss": 0.0508, + "step": 7570 + }, + { + "epoch": 6.528911381501023, + "grad_norm": 2.59375, + "learning_rate": 4.842e-06, + "loss": 0.0557, + "step": 7580 + }, + { + "epoch": 6.537525573382148, + "grad_norm": 2.765625, + "learning_rate": 4.822000000000001e-06, + "loss": 0.0564, + "step": 7590 + }, + { + "epoch": 6.546139765263272, + "grad_norm": 2.140625, + "learning_rate": 4.802000000000001e-06, + "loss": 0.0519, + "step": 7600 + }, + { + "epoch": 6.554753957144396, + "grad_norm": 2.53125, + "learning_rate": 4.782e-06, + "loss": 0.056, + "step": 7610 + }, + { + "epoch": 6.56336814902552, + "grad_norm": 2.546875, + "learning_rate": 4.762e-06, + "loss": 0.0554, + "step": 7620 + }, + { + "epoch": 6.571982340906644, + "grad_norm": 2.703125, + "learning_rate": 4.7420000000000005e-06, + "loss": 0.0538, + "step": 7630 + }, + { + "epoch": 6.580596532787768, + "grad_norm": 2.59375, + "learning_rate": 4.722000000000001e-06, + "loss": 0.0527, + "step": 7640 + }, + { + "epoch": 6.589210724668892, + "grad_norm": 2.765625, + "learning_rate": 4.702e-06, + "loss": 0.0558, + "step": 7650 + }, + { + "epoch": 6.5978249165500165, + "grad_norm": 2.4375, + "learning_rate": 4.682e-06, + "loss": 0.0506, + "step": 7660 + }, + { + "epoch": 6.6064391084311405, + "grad_norm": 3.4375, + "learning_rate": 4.6620000000000004e-06, + "loss": 0.055, + "step": 7670 + }, + { + "epoch": 6.6150533003122645, + "grad_norm": 2.484375, + "learning_rate": 4.642000000000001e-06, + "loss": 0.0535, + "step": 7680 + }, + { + "epoch": 6.6236674921933885, + "grad_norm": 2.65625, + "learning_rate": 4.622e-06, + "loss": 0.0564, + "step": 7690 + }, + { + "epoch": 6.6322816840745125, + "grad_norm": 2.6875, + "learning_rate": 4.602e-06, + "loss": 0.0567, + "step": 7700 + }, + { + "epoch": 6.640895875955637, + "grad_norm": 2.84375, + "learning_rate": 4.582e-06, + "loss": 0.0557, + "step": 7710 + }, + { + "epoch": 6.649510067836761, + "grad_norm": 2.625, + "learning_rate": 4.5620000000000005e-06, + "loss": 0.0533, + "step": 7720 + }, + { + "epoch": 6.658124259717885, + "grad_norm": 2.1875, + "learning_rate": 4.542e-06, + "loss": 0.0548, + "step": 7730 + }, + { + "epoch": 6.666738451599009, + "grad_norm": 2.734375, + "learning_rate": 4.522e-06, + "loss": 0.0543, + "step": 7740 + }, + { + "epoch": 6.675352643480133, + "grad_norm": 2.484375, + "learning_rate": 4.502e-06, + "loss": 0.0498, + "step": 7750 + }, + { + "epoch": 6.683966835361257, + "grad_norm": 2.640625, + "learning_rate": 4.4820000000000005e-06, + "loss": 0.0559, + "step": 7760 + }, + { + "epoch": 6.692581027242381, + "grad_norm": 2.796875, + "learning_rate": 4.462e-06, + "loss": 0.0556, + "step": 7770 + }, + { + "epoch": 6.701195219123506, + "grad_norm": 2.3125, + "learning_rate": 4.442e-06, + "loss": 0.051, + "step": 7780 + }, + { + "epoch": 6.70980941100463, + "grad_norm": 2.65625, + "learning_rate": 4.422e-06, + "loss": 0.0555, + "step": 7790 + }, + { + "epoch": 6.718423602885754, + "grad_norm": 2.5625, + "learning_rate": 4.402e-06, + "loss": 0.0522, + "step": 7800 + }, + { + "epoch": 6.727037794766878, + "grad_norm": 2.265625, + "learning_rate": 4.382e-06, + "loss": 0.0555, + "step": 7810 + }, + { + "epoch": 6.735651986648002, + "grad_norm": 4.125, + "learning_rate": 4.362e-06, + "loss": 0.0519, + "step": 7820 + }, + { + "epoch": 6.744266178529127, + "grad_norm": 3.25, + "learning_rate": 4.342e-06, + "loss": 0.0538, + "step": 7830 + }, + { + "epoch": 6.752880370410251, + "grad_norm": 3.328125, + "learning_rate": 4.322e-06, + "loss": 0.056, + "step": 7840 + }, + { + "epoch": 6.761494562291375, + "grad_norm": 2.734375, + "learning_rate": 4.3020000000000005e-06, + "loss": 0.0538, + "step": 7850 + }, + { + "epoch": 6.770108754172499, + "grad_norm": 2.171875, + "learning_rate": 4.282000000000001e-06, + "loss": 0.0513, + "step": 7860 + }, + { + "epoch": 6.778722946053623, + "grad_norm": 2.921875, + "learning_rate": 4.262000000000001e-06, + "loss": 0.0548, + "step": 7870 + }, + { + "epoch": 6.787337137934747, + "grad_norm": 3.0625, + "learning_rate": 4.242e-06, + "loss": 0.0538, + "step": 7880 + }, + { + "epoch": 6.795951329815872, + "grad_norm": 2.75, + "learning_rate": 4.222e-06, + "loss": 0.0537, + "step": 7890 + }, + { + "epoch": 6.804565521696996, + "grad_norm": 2.859375, + "learning_rate": 4.202000000000001e-06, + "loss": 0.0582, + "step": 7900 + }, + { + "epoch": 6.81317971357812, + "grad_norm": 2.421875, + "learning_rate": 4.182000000000001e-06, + "loss": 0.0577, + "step": 7910 + }, + { + "epoch": 6.821793905459244, + "grad_norm": 2.75, + "learning_rate": 4.162e-06, + "loss": 0.0564, + "step": 7920 + }, + { + "epoch": 6.830408097340368, + "grad_norm": 2.9375, + "learning_rate": 4.142e-06, + "loss": 0.0551, + "step": 7930 + }, + { + "epoch": 6.839022289221492, + "grad_norm": 2.734375, + "learning_rate": 4.1220000000000005e-06, + "loss": 0.0501, + "step": 7940 + }, + { + "epoch": 6.847636481102617, + "grad_norm": 2.515625, + "learning_rate": 4.102000000000001e-06, + "loss": 0.0498, + "step": 7950 + }, + { + "epoch": 6.856250672983741, + "grad_norm": 3.125, + "learning_rate": 4.082e-06, + "loss": 0.0512, + "step": 7960 + }, + { + "epoch": 6.864864864864865, + "grad_norm": 2.5, + "learning_rate": 4.062e-06, + "loss": 0.052, + "step": 7970 + }, + { + "epoch": 6.873479056745989, + "grad_norm": 2.84375, + "learning_rate": 4.0420000000000004e-06, + "loss": 0.0563, + "step": 7980 + }, + { + "epoch": 6.882093248627113, + "grad_norm": 2.671875, + "learning_rate": 4.022000000000001e-06, + "loss": 0.0526, + "step": 7990 + }, + { + "epoch": 6.890707440508237, + "grad_norm": 2.640625, + "learning_rate": 4.002e-06, + "loss": 0.0556, + "step": 8000 + }, + { + "epoch": 6.899321632389362, + "grad_norm": 2.8125, + "learning_rate": 3.982e-06, + "loss": 0.0553, + "step": 8010 + }, + { + "epoch": 6.907935824270486, + "grad_norm": 2.828125, + "learning_rate": 3.962e-06, + "loss": 0.0538, + "step": 8020 + }, + { + "epoch": 6.91655001615161, + "grad_norm": 3.046875, + "learning_rate": 3.9420000000000005e-06, + "loss": 0.056, + "step": 8030 + }, + { + "epoch": 6.925164208032734, + "grad_norm": 3.203125, + "learning_rate": 3.922e-06, + "loss": 0.0553, + "step": 8040 + }, + { + "epoch": 6.933778399913858, + "grad_norm": 2.796875, + "learning_rate": 3.902e-06, + "loss": 0.054, + "step": 8050 + }, + { + "epoch": 6.942392591794983, + "grad_norm": 2.390625, + "learning_rate": 3.882e-06, + "loss": 0.056, + "step": 8060 + }, + { + "epoch": 6.951006783676107, + "grad_norm": 2.296875, + "learning_rate": 3.8620000000000005e-06, + "loss": 0.055, + "step": 8070 + }, + { + "epoch": 6.959620975557231, + "grad_norm": 2.6875, + "learning_rate": 3.842e-06, + "loss": 0.0542, + "step": 8080 + }, + { + "epoch": 6.968235167438355, + "grad_norm": 2.5, + "learning_rate": 3.822e-06, + "loss": 0.052, + "step": 8090 + }, + { + "epoch": 6.976849359319479, + "grad_norm": 2.34375, + "learning_rate": 3.802e-06, + "loss": 0.0528, + "step": 8100 + }, + { + "epoch": 6.985463551200603, + "grad_norm": 2.875, + "learning_rate": 3.782e-06, + "loss": 0.0551, + "step": 8110 + }, + { + "epoch": 6.994077743081727, + "grad_norm": 2.46875, + "learning_rate": 3.762e-06, + "loss": 0.0545, + "step": 8120 + }, + { + "epoch": 7.0025842575643376, + "grad_norm": 2.46875, + "learning_rate": 3.742e-06, + "loss": 0.0536, + "step": 8130 + }, + { + "epoch": 7.0111984494454616, + "grad_norm": 1.9296875, + "learning_rate": 3.722e-06, + "loss": 0.0441, + "step": 8140 + }, + { + "epoch": 7.019812641326586, + "grad_norm": 2.234375, + "learning_rate": 3.702e-06, + "loss": 0.0494, + "step": 8150 + }, + { + "epoch": 7.02842683320771, + "grad_norm": 2.546875, + "learning_rate": 3.6820000000000005e-06, + "loss": 0.0473, + "step": 8160 + }, + { + "epoch": 7.037041025088834, + "grad_norm": 2.546875, + "learning_rate": 3.6620000000000007e-06, + "loss": 0.0443, + "step": 8170 + }, + { + "epoch": 7.0456552169699584, + "grad_norm": 2.25, + "learning_rate": 3.6420000000000005e-06, + "loss": 0.0454, + "step": 8180 + }, + { + "epoch": 7.0542694088510824, + "grad_norm": 2.484375, + "learning_rate": 3.6220000000000006e-06, + "loss": 0.0465, + "step": 8190 + }, + { + "epoch": 7.0628836007322064, + "grad_norm": 2.5625, + "learning_rate": 3.6020000000000004e-06, + "loss": 0.0463, + "step": 8200 + }, + { + "epoch": 7.0714977926133304, + "grad_norm": 2.375, + "learning_rate": 3.5820000000000006e-06, + "loss": 0.0454, + "step": 8210 + }, + { + "epoch": 7.0801119844944544, + "grad_norm": 2.421875, + "learning_rate": 3.5620000000000004e-06, + "loss": 0.0453, + "step": 8220 + }, + { + "epoch": 7.0887261763755784, + "grad_norm": 1.984375, + "learning_rate": 3.5420000000000006e-06, + "loss": 0.0459, + "step": 8230 + }, + { + "epoch": 7.097340368256703, + "grad_norm": 2.53125, + "learning_rate": 3.5220000000000003e-06, + "loss": 0.0462, + "step": 8240 + }, + { + "epoch": 7.105954560137827, + "grad_norm": 2.921875, + "learning_rate": 3.5020000000000005e-06, + "loss": 0.0486, + "step": 8250 + }, + { + "epoch": 7.114568752018951, + "grad_norm": 2.390625, + "learning_rate": 3.4820000000000003e-06, + "loss": 0.0436, + "step": 8260 + }, + { + "epoch": 7.123182943900075, + "grad_norm": 2.375, + "learning_rate": 3.4620000000000005e-06, + "loss": 0.0462, + "step": 8270 + }, + { + "epoch": 7.131797135781199, + "grad_norm": 2.21875, + "learning_rate": 3.4420000000000002e-06, + "loss": 0.045, + "step": 8280 + }, + { + "epoch": 7.140411327662323, + "grad_norm": 2.65625, + "learning_rate": 3.4220000000000004e-06, + "loss": 0.0461, + "step": 8290 + }, + { + "epoch": 7.149025519543448, + "grad_norm": 2.09375, + "learning_rate": 3.402e-06, + "loss": 0.0453, + "step": 8300 + }, + { + "epoch": 7.157639711424572, + "grad_norm": 2.375, + "learning_rate": 3.3820000000000004e-06, + "loss": 0.045, + "step": 8310 + }, + { + "epoch": 7.166253903305696, + "grad_norm": 2.46875, + "learning_rate": 3.362e-06, + "loss": 0.0483, + "step": 8320 + }, + { + "epoch": 7.17486809518682, + "grad_norm": 2.25, + "learning_rate": 3.3420000000000004e-06, + "loss": 0.0452, + "step": 8330 + }, + { + "epoch": 7.183482287067944, + "grad_norm": 2.40625, + "learning_rate": 3.322e-06, + "loss": 0.0485, + "step": 8340 + }, + { + "epoch": 7.192096478949068, + "grad_norm": 2.28125, + "learning_rate": 3.3020000000000003e-06, + "loss": 0.0485, + "step": 8350 + }, + { + "epoch": 7.200710670830193, + "grad_norm": 2.125, + "learning_rate": 3.282e-06, + "loss": 0.0461, + "step": 8360 + }, + { + "epoch": 7.209324862711317, + "grad_norm": 2.5625, + "learning_rate": 3.2620000000000003e-06, + "loss": 0.0498, + "step": 8370 + }, + { + "epoch": 7.217939054592441, + "grad_norm": 2.296875, + "learning_rate": 3.242e-06, + "loss": 0.0469, + "step": 8380 + }, + { + "epoch": 7.226553246473565, + "grad_norm": 1.9609375, + "learning_rate": 3.2220000000000002e-06, + "loss": 0.0428, + "step": 8390 + }, + { + "epoch": 7.235167438354689, + "grad_norm": 2.328125, + "learning_rate": 3.202e-06, + "loss": 0.0449, + "step": 8400 + }, + { + "epoch": 7.243781630235813, + "grad_norm": 2.09375, + "learning_rate": 3.182e-06, + "loss": 0.0441, + "step": 8410 + }, + { + "epoch": 7.252395822116938, + "grad_norm": 2.4375, + "learning_rate": 3.162e-06, + "loss": 0.0452, + "step": 8420 + }, + { + "epoch": 7.261010013998062, + "grad_norm": 2.578125, + "learning_rate": 3.142e-06, + "loss": 0.0458, + "step": 8430 + }, + { + "epoch": 7.269624205879186, + "grad_norm": 2.390625, + "learning_rate": 3.122e-06, + "loss": 0.0421, + "step": 8440 + }, + { + "epoch": 7.27823839776031, + "grad_norm": 2.65625, + "learning_rate": 3.102e-06, + "loss": 0.0443, + "step": 8450 + }, + { + "epoch": 7.286852589641434, + "grad_norm": 2.734375, + "learning_rate": 3.082e-06, + "loss": 0.0446, + "step": 8460 + }, + { + "epoch": 7.295466781522558, + "grad_norm": 2.515625, + "learning_rate": 3.0620000000000005e-06, + "loss": 0.0449, + "step": 8470 + }, + { + "epoch": 7.304080973403683, + "grad_norm": 2.15625, + "learning_rate": 3.0420000000000007e-06, + "loss": 0.0413, + "step": 8480 + }, + { + "epoch": 7.312695165284807, + "grad_norm": 2.46875, + "learning_rate": 3.0220000000000005e-06, + "loss": 0.0456, + "step": 8490 + }, + { + "epoch": 7.321309357165931, + "grad_norm": 2.5, + "learning_rate": 3.0020000000000006e-06, + "loss": 0.0459, + "step": 8500 + }, + { + "epoch": 7.329923549047055, + "grad_norm": 2.140625, + "learning_rate": 2.9820000000000004e-06, + "loss": 0.0477, + "step": 8510 + }, + { + "epoch": 7.338537740928179, + "grad_norm": 2.375, + "learning_rate": 2.9620000000000006e-06, + "loss": 0.0462, + "step": 8520 + }, + { + "epoch": 7.347151932809304, + "grad_norm": 2.3125, + "learning_rate": 2.9420000000000004e-06, + "loss": 0.0428, + "step": 8530 + }, + { + "epoch": 7.355766124690428, + "grad_norm": 2.359375, + "learning_rate": 2.9220000000000006e-06, + "loss": 0.0438, + "step": 8540 + }, + { + "epoch": 7.364380316571552, + "grad_norm": 2.65625, + "learning_rate": 2.9020000000000003e-06, + "loss": 0.0451, + "step": 8550 + }, + { + "epoch": 7.372994508452676, + "grad_norm": 2.4375, + "learning_rate": 2.8820000000000005e-06, + "loss": 0.0468, + "step": 8560 + }, + { + "epoch": 7.3816087003338, + "grad_norm": 2.171875, + "learning_rate": 2.8620000000000003e-06, + "loss": 0.0462, + "step": 8570 + }, + { + "epoch": 7.390222892214924, + "grad_norm": 2.546875, + "learning_rate": 2.8420000000000005e-06, + "loss": 0.0465, + "step": 8580 + }, + { + "epoch": 7.398837084096048, + "grad_norm": 2.375, + "learning_rate": 2.8220000000000003e-06, + "loss": 0.0471, + "step": 8590 + }, + { + "epoch": 7.407451275977173, + "grad_norm": 2.296875, + "learning_rate": 2.8020000000000004e-06, + "loss": 0.0473, + "step": 8600 + }, + { + "epoch": 7.416065467858297, + "grad_norm": 2.625, + "learning_rate": 2.7820000000000002e-06, + "loss": 0.0486, + "step": 8610 + }, + { + "epoch": 7.424679659739421, + "grad_norm": 2.40625, + "learning_rate": 2.7620000000000004e-06, + "loss": 0.0457, + "step": 8620 + }, + { + "epoch": 7.433293851620545, + "grad_norm": 2.0625, + "learning_rate": 2.742e-06, + "loss": 0.0481, + "step": 8630 + }, + { + "epoch": 7.441908043501669, + "grad_norm": 2.6875, + "learning_rate": 2.7220000000000004e-06, + "loss": 0.0452, + "step": 8640 + }, + { + "epoch": 7.450522235382794, + "grad_norm": 2.265625, + "learning_rate": 2.702e-06, + "loss": 0.0428, + "step": 8650 + }, + { + "epoch": 7.459136427263918, + "grad_norm": 2.40625, + "learning_rate": 2.6820000000000003e-06, + "loss": 0.0508, + "step": 8660 + }, + { + "epoch": 7.467750619145042, + "grad_norm": 2.28125, + "learning_rate": 2.662e-06, + "loss": 0.0443, + "step": 8670 + }, + { + "epoch": 7.476364811026166, + "grad_norm": 2.5, + "learning_rate": 2.6420000000000003e-06, + "loss": 0.047, + "step": 8680 + }, + { + "epoch": 7.48497900290729, + "grad_norm": 2.609375, + "learning_rate": 2.622e-06, + "loss": 0.0457, + "step": 8690 + }, + { + "epoch": 7.493593194788414, + "grad_norm": 2.5625, + "learning_rate": 2.6020000000000002e-06, + "loss": 0.0467, + "step": 8700 + }, + { + "epoch": 7.502207386669538, + "grad_norm": 2.59375, + "learning_rate": 2.582e-06, + "loss": 0.0468, + "step": 8710 + }, + { + "epoch": 7.5108215785506625, + "grad_norm": 2.078125, + "learning_rate": 2.562e-06, + "loss": 0.0447, + "step": 8720 + }, + { + "epoch": 7.5194357704317865, + "grad_norm": 2.0625, + "learning_rate": 2.542e-06, + "loss": 0.0449, + "step": 8730 + }, + { + "epoch": 7.5280499623129105, + "grad_norm": 2.46875, + "learning_rate": 2.522e-06, + "loss": 0.0466, + "step": 8740 + }, + { + "epoch": 7.5366641541940345, + "grad_norm": 2.3125, + "learning_rate": 2.502e-06, + "loss": 0.0429, + "step": 8750 + }, + { + "epoch": 7.5452783460751585, + "grad_norm": 2.859375, + "learning_rate": 2.482e-06, + "loss": 0.0474, + "step": 8760 + }, + { + "epoch": 7.553892537956283, + "grad_norm": 2.5625, + "learning_rate": 2.4620000000000003e-06, + "loss": 0.0469, + "step": 8770 + }, + { + "epoch": 7.562506729837407, + "grad_norm": 2.421875, + "learning_rate": 2.442e-06, + "loss": 0.0467, + "step": 8780 + }, + { + "epoch": 7.571120921718531, + "grad_norm": 2.8125, + "learning_rate": 2.4220000000000003e-06, + "loss": 0.0497, + "step": 8790 + }, + { + "epoch": 7.579735113599655, + "grad_norm": 2.28125, + "learning_rate": 2.402e-06, + "loss": 0.045, + "step": 8800 + }, + { + "epoch": 7.588349305480779, + "grad_norm": 2.953125, + "learning_rate": 2.3820000000000002e-06, + "loss": 0.0472, + "step": 8810 + }, + { + "epoch": 7.596963497361903, + "grad_norm": 2.859375, + "learning_rate": 2.362e-06, + "loss": 0.0495, + "step": 8820 + }, + { + "epoch": 7.605577689243028, + "grad_norm": 2.40625, + "learning_rate": 2.342e-06, + "loss": 0.0441, + "step": 8830 + }, + { + "epoch": 7.614191881124152, + "grad_norm": 2.078125, + "learning_rate": 2.322e-06, + "loss": 0.0466, + "step": 8840 + }, + { + "epoch": 7.622806073005276, + "grad_norm": 2.671875, + "learning_rate": 2.302e-06, + "loss": 0.0447, + "step": 8850 + }, + { + "epoch": 7.6314202648864, + "grad_norm": 2.3125, + "learning_rate": 2.282e-06, + "loss": 0.0469, + "step": 8860 + }, + { + "epoch": 7.640034456767524, + "grad_norm": 3.34375, + "learning_rate": 2.262e-06, + "loss": 0.0475, + "step": 8870 + }, + { + "epoch": 7.648648648648649, + "grad_norm": 2.515625, + "learning_rate": 2.2420000000000003e-06, + "loss": 0.0457, + "step": 8880 + }, + { + "epoch": 7.657262840529773, + "grad_norm": 2.4375, + "learning_rate": 2.222e-06, + "loss": 0.0437, + "step": 8890 + }, + { + "epoch": 7.665877032410897, + "grad_norm": 2.25, + "learning_rate": 2.2020000000000003e-06, + "loss": 0.0467, + "step": 8900 + }, + { + "epoch": 7.674491224292021, + "grad_norm": 2.75, + "learning_rate": 2.182e-06, + "loss": 0.0472, + "step": 8910 + }, + { + "epoch": 7.683105416173145, + "grad_norm": 2.84375, + "learning_rate": 2.1620000000000002e-06, + "loss": 0.0477, + "step": 8920 + }, + { + "epoch": 7.691719608054269, + "grad_norm": 2.875, + "learning_rate": 2.142e-06, + "loss": 0.0469, + "step": 8930 + }, + { + "epoch": 7.700333799935393, + "grad_norm": 2.609375, + "learning_rate": 2.122e-06, + "loss": 0.0481, + "step": 8940 + }, + { + "epoch": 7.708947991816518, + "grad_norm": 2.984375, + "learning_rate": 2.102e-06, + "loss": 0.0464, + "step": 8950 + }, + { + "epoch": 7.717562183697642, + "grad_norm": 2.578125, + "learning_rate": 2.082e-06, + "loss": 0.0469, + "step": 8960 + }, + { + "epoch": 7.726176375578766, + "grad_norm": 2.515625, + "learning_rate": 2.062e-06, + "loss": 0.045, + "step": 8970 + }, + { + "epoch": 7.73479056745989, + "grad_norm": 2.4375, + "learning_rate": 2.042e-06, + "loss": 0.0434, + "step": 8980 + }, + { + "epoch": 7.743404759341014, + "grad_norm": 2.78125, + "learning_rate": 2.022e-06, + "loss": 0.0437, + "step": 8990 + }, + { + "epoch": 7.752018951222139, + "grad_norm": 2.59375, + "learning_rate": 2.002e-06, + "loss": 0.0469, + "step": 9000 + }, + { + "epoch": 7.760633143103263, + "grad_norm": 2.453125, + "learning_rate": 1.982e-06, + "loss": 0.0458, + "step": 9010 + }, + { + "epoch": 7.769247334984387, + "grad_norm": 2.609375, + "learning_rate": 1.9620000000000004e-06, + "loss": 0.049, + "step": 9020 + }, + { + "epoch": 7.777861526865511, + "grad_norm": 2.390625, + "learning_rate": 1.942e-06, + "loss": 0.0472, + "step": 9030 + }, + { + "epoch": 7.786475718746635, + "grad_norm": 2.1875, + "learning_rate": 1.9220000000000004e-06, + "loss": 0.0474, + "step": 9040 + }, + { + "epoch": 7.795089910627759, + "grad_norm": 2.609375, + "learning_rate": 1.9020000000000002e-06, + "loss": 0.0456, + "step": 9050 + }, + { + "epoch": 7.803704102508883, + "grad_norm": 2.296875, + "learning_rate": 1.8820000000000001e-06, + "loss": 0.0427, + "step": 9060 + }, + { + "epoch": 7.812318294390008, + "grad_norm": 2.71875, + "learning_rate": 1.8620000000000001e-06, + "loss": 0.0441, + "step": 9070 + }, + { + "epoch": 7.820932486271132, + "grad_norm": 2.296875, + "learning_rate": 1.8420000000000001e-06, + "loss": 0.0435, + "step": 9080 + }, + { + "epoch": 7.829546678152256, + "grad_norm": 2.4375, + "learning_rate": 1.822e-06, + "loss": 0.0489, + "step": 9090 + }, + { + "epoch": 7.83816087003338, + "grad_norm": 2.453125, + "learning_rate": 1.802e-06, + "loss": 0.0443, + "step": 9100 + }, + { + "epoch": 7.846775061914504, + "grad_norm": 2.578125, + "learning_rate": 1.782e-06, + "loss": 0.045, + "step": 9110 + }, + { + "epoch": 7.855389253795629, + "grad_norm": 2.765625, + "learning_rate": 1.762e-06, + "loss": 0.0422, + "step": 9120 + }, + { + "epoch": 7.864003445676753, + "grad_norm": 2.546875, + "learning_rate": 1.742e-06, + "loss": 0.0454, + "step": 9130 + }, + { + "epoch": 7.872617637557877, + "grad_norm": 2.46875, + "learning_rate": 1.722e-06, + "loss": 0.0441, + "step": 9140 + }, + { + "epoch": 7.881231829439001, + "grad_norm": 2.4375, + "learning_rate": 1.702e-06, + "loss": 0.046, + "step": 9150 + }, + { + "epoch": 7.889846021320125, + "grad_norm": 2.71875, + "learning_rate": 1.682e-06, + "loss": 0.0467, + "step": 9160 + }, + { + "epoch": 7.898460213201249, + "grad_norm": 2.75, + "learning_rate": 1.662e-06, + "loss": 0.0486, + "step": 9170 + }, + { + "epoch": 7.907074405082374, + "grad_norm": 2.640625, + "learning_rate": 1.6420000000000003e-06, + "loss": 0.0475, + "step": 9180 + }, + { + "epoch": 7.915688596963498, + "grad_norm": 2.40625, + "learning_rate": 1.6220000000000003e-06, + "loss": 0.0476, + "step": 9190 + }, + { + "epoch": 7.924302788844622, + "grad_norm": 2.234375, + "learning_rate": 1.6020000000000003e-06, + "loss": 0.0425, + "step": 9200 + }, + { + "epoch": 7.932916980725746, + "grad_norm": 2.28125, + "learning_rate": 1.5820000000000003e-06, + "loss": 0.0447, + "step": 9210 + }, + { + "epoch": 7.94153117260687, + "grad_norm": 2.109375, + "learning_rate": 1.5620000000000002e-06, + "loss": 0.0484, + "step": 9220 + }, + { + "epoch": 7.950145364487994, + "grad_norm": 2.46875, + "learning_rate": 1.5420000000000002e-06, + "loss": 0.0455, + "step": 9230 + }, + { + "epoch": 7.9587595563691185, + "grad_norm": 2.703125, + "learning_rate": 1.5220000000000002e-06, + "loss": 0.0462, + "step": 9240 + }, + { + "epoch": 7.9673737482502425, + "grad_norm": 2.328125, + "learning_rate": 1.5020000000000002e-06, + "loss": 0.045, + "step": 9250 + }, + { + "epoch": 7.9759879401313665, + "grad_norm": 2.34375, + "learning_rate": 1.4820000000000002e-06, + "loss": 0.0447, + "step": 9260 + }, + { + "epoch": 7.9846021320124905, + "grad_norm": 3.34375, + "learning_rate": 1.4620000000000001e-06, + "loss": 0.0472, + "step": 9270 + }, + { + "epoch": 7.9932163238936145, + "grad_norm": 2.578125, + "learning_rate": 1.4420000000000001e-06, + "loss": 0.047, + "step": 9280 + }, + { + "epoch": 8.001722838376224, + "grad_norm": 2.03125, + "learning_rate": 1.4220000000000001e-06, + "loss": 0.0444, + "step": 9290 + }, + { + "epoch": 8.010337030257348, + "grad_norm": 2.59375, + "learning_rate": 1.402e-06, + "loss": 0.0444, + "step": 9300 + }, + { + "epoch": 8.018951222138472, + "grad_norm": 1.953125, + "learning_rate": 1.382e-06, + "loss": 0.0397, + "step": 9310 + }, + { + "epoch": 8.027565414019596, + "grad_norm": 2.40625, + "learning_rate": 1.362e-06, + "loss": 0.0425, + "step": 9320 + }, + { + "epoch": 8.036179605900722, + "grad_norm": 2.1875, + "learning_rate": 1.3420000000000002e-06, + "loss": 0.043, + "step": 9330 + }, + { + "epoch": 8.044793797781846, + "grad_norm": 2.390625, + "learning_rate": 1.3220000000000002e-06, + "loss": 0.0451, + "step": 9340 + }, + { + "epoch": 8.05340798966297, + "grad_norm": 2.25, + "learning_rate": 1.3020000000000002e-06, + "loss": 0.0442, + "step": 9350 + }, + { + "epoch": 8.062022181544094, + "grad_norm": 1.96875, + "learning_rate": 1.2820000000000002e-06, + "loss": 0.0428, + "step": 9360 + }, + { + "epoch": 8.070636373425218, + "grad_norm": 2.171875, + "learning_rate": 1.2620000000000002e-06, + "loss": 0.0441, + "step": 9370 + }, + { + "epoch": 8.079250565306342, + "grad_norm": 2.390625, + "learning_rate": 1.2420000000000001e-06, + "loss": 0.045, + "step": 9380 + }, + { + "epoch": 8.087864757187466, + "grad_norm": 2.421875, + "learning_rate": 1.2220000000000001e-06, + "loss": 0.0443, + "step": 9390 + }, + { + "epoch": 8.09647894906859, + "grad_norm": 2.515625, + "learning_rate": 1.202e-06, + "loss": 0.0422, + "step": 9400 + }, + { + "epoch": 8.105093140949714, + "grad_norm": 2.28125, + "learning_rate": 1.182e-06, + "loss": 0.0417, + "step": 9410 + }, + { + "epoch": 8.113707332830838, + "grad_norm": 2.28125, + "learning_rate": 1.162e-06, + "loss": 0.0432, + "step": 9420 + }, + { + "epoch": 8.122321524711962, + "grad_norm": 2.453125, + "learning_rate": 1.142e-06, + "loss": 0.0419, + "step": 9430 + }, + { + "epoch": 8.130935716593086, + "grad_norm": 2.109375, + "learning_rate": 1.122e-06, + "loss": 0.0416, + "step": 9440 + }, + { + "epoch": 8.139549908474212, + "grad_norm": 2.359375, + "learning_rate": 1.1020000000000002e-06, + "loss": 0.0422, + "step": 9450 + }, + { + "epoch": 8.148164100355336, + "grad_norm": 2.09375, + "learning_rate": 1.0820000000000002e-06, + "loss": 0.0455, + "step": 9460 + }, + { + "epoch": 8.15677829223646, + "grad_norm": 2.609375, + "learning_rate": 1.0620000000000002e-06, + "loss": 0.0455, + "step": 9470 + }, + { + "epoch": 8.165392484117584, + "grad_norm": 2.296875, + "learning_rate": 1.0420000000000001e-06, + "loss": 0.0441, + "step": 9480 + }, + { + "epoch": 8.174006675998708, + "grad_norm": 2.484375, + "learning_rate": 1.0220000000000001e-06, + "loss": 0.0433, + "step": 9490 + }, + { + "epoch": 8.182620867879832, + "grad_norm": 2.453125, + "learning_rate": 1.002e-06, + "loss": 0.045, + "step": 9500 + }, + { + "epoch": 8.191235059760956, + "grad_norm": 2.15625, + "learning_rate": 9.82e-07, + "loss": 0.0443, + "step": 9510 + }, + { + "epoch": 8.19984925164208, + "grad_norm": 2.203125, + "learning_rate": 9.62e-07, + "loss": 0.0427, + "step": 9520 + }, + { + "epoch": 8.208463443523204, + "grad_norm": 2.0625, + "learning_rate": 9.420000000000002e-07, + "loss": 0.0412, + "step": 9530 + }, + { + "epoch": 8.217077635404328, + "grad_norm": 2.234375, + "learning_rate": 9.220000000000001e-07, + "loss": 0.0431, + "step": 9540 + }, + { + "epoch": 8.225691827285452, + "grad_norm": 2.28125, + "learning_rate": 9.020000000000001e-07, + "loss": 0.0414, + "step": 9550 + }, + { + "epoch": 8.234306019166578, + "grad_norm": 2.796875, + "learning_rate": 8.820000000000001e-07, + "loss": 0.0454, + "step": 9560 + }, + { + "epoch": 8.242920211047702, + "grad_norm": 2.40625, + "learning_rate": 8.620000000000001e-07, + "loss": 0.0417, + "step": 9570 + }, + { + "epoch": 8.251534402928826, + "grad_norm": 2.25, + "learning_rate": 8.42e-07, + "loss": 0.0446, + "step": 9580 + }, + { + "epoch": 8.26014859480995, + "grad_norm": 2.828125, + "learning_rate": 8.22e-07, + "loss": 0.0434, + "step": 9590 + }, + { + "epoch": 8.268762786691074, + "grad_norm": 2.140625, + "learning_rate": 8.02e-07, + "loss": 0.0397, + "step": 9600 + }, + { + "epoch": 8.277376978572198, + "grad_norm": 2.0625, + "learning_rate": 7.820000000000001e-07, + "loss": 0.0429, + "step": 9610 + }, + { + "epoch": 8.285991170453322, + "grad_norm": 1.921875, + "learning_rate": 7.620000000000001e-07, + "loss": 0.0421, + "step": 9620 + }, + { + "epoch": 8.294605362334446, + "grad_norm": 2.65625, + "learning_rate": 7.420000000000001e-07, + "loss": 0.0433, + "step": 9630 + }, + { + "epoch": 8.30321955421557, + "grad_norm": 2.203125, + "learning_rate": 7.22e-07, + "loss": 0.0449, + "step": 9640 + }, + { + "epoch": 8.311833746096694, + "grad_norm": 2.203125, + "learning_rate": 7.02e-07, + "loss": 0.043, + "step": 9650 + }, + { + "epoch": 8.320447937977818, + "grad_norm": 2.25, + "learning_rate": 6.82e-07, + "loss": 0.0442, + "step": 9660 + }, + { + "epoch": 8.329062129858942, + "grad_norm": 2.171875, + "learning_rate": 6.62e-07, + "loss": 0.045, + "step": 9670 + }, + { + "epoch": 8.337676321740068, + "grad_norm": 2.359375, + "learning_rate": 6.42e-07, + "loss": 0.0457, + "step": 9680 + }, + { + "epoch": 8.346290513621192, + "grad_norm": 2.265625, + "learning_rate": 6.22e-07, + "loss": 0.0431, + "step": 9690 + }, + { + "epoch": 8.354904705502316, + "grad_norm": 2.640625, + "learning_rate": 6.02e-07, + "loss": 0.0465, + "step": 9700 + }, + { + "epoch": 8.36351889738344, + "grad_norm": 2.34375, + "learning_rate": 5.820000000000001e-07, + "loss": 0.0423, + "step": 9710 + }, + { + "epoch": 8.372133089264564, + "grad_norm": 2.296875, + "learning_rate": 5.620000000000001e-07, + "loss": 0.0428, + "step": 9720 + }, + { + "epoch": 8.380747281145688, + "grad_norm": 2.21875, + "learning_rate": 5.420000000000001e-07, + "loss": 0.041, + "step": 9730 + }, + { + "epoch": 8.389361473026812, + "grad_norm": 2.734375, + "learning_rate": 5.22e-07, + "loss": 0.0434, + "step": 9740 + }, + { + "epoch": 8.397975664907936, + "grad_norm": 2.1875, + "learning_rate": 5.02e-07, + "loss": 0.0451, + "step": 9750 + }, + { + "epoch": 8.40658985678906, + "grad_norm": 2.34375, + "learning_rate": 4.82e-07, + "loss": 0.0422, + "step": 9760 + }, + { + "epoch": 8.415204048670184, + "grad_norm": 2.140625, + "learning_rate": 4.6200000000000003e-07, + "loss": 0.0443, + "step": 9770 + }, + { + "epoch": 8.423818240551308, + "grad_norm": 2.15625, + "learning_rate": 4.4200000000000007e-07, + "loss": 0.043, + "step": 9780 + }, + { + "epoch": 8.432432432432432, + "grad_norm": 2.46875, + "learning_rate": 4.2200000000000005e-07, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 8.441046624313557, + "grad_norm": 2.03125, + "learning_rate": 4.02e-07, + "loss": 0.0437, + "step": 9800 + }, + { + "epoch": 8.449660816194681, + "grad_norm": 2.4375, + "learning_rate": 3.82e-07, + "loss": 0.0428, + "step": 9810 + }, + { + "epoch": 8.458275008075805, + "grad_norm": 2.421875, + "learning_rate": 3.6200000000000004e-07, + "loss": 0.0453, + "step": 9820 + }, + { + "epoch": 8.46688919995693, + "grad_norm": 2.453125, + "learning_rate": 3.42e-07, + "loss": 0.0444, + "step": 9830 + }, + { + "epoch": 8.475503391838053, + "grad_norm": 2.890625, + "learning_rate": 3.22e-07, + "loss": 0.0433, + "step": 9840 + }, + { + "epoch": 8.484117583719177, + "grad_norm": 2.078125, + "learning_rate": 3.0200000000000003e-07, + "loss": 0.0442, + "step": 9850 + }, + { + "epoch": 8.492731775600301, + "grad_norm": 2.234375, + "learning_rate": 2.82e-07, + "loss": 0.0434, + "step": 9860 + }, + { + "epoch": 8.501345967481425, + "grad_norm": 2.40625, + "learning_rate": 2.6200000000000004e-07, + "loss": 0.0454, + "step": 9870 + }, + { + "epoch": 8.50996015936255, + "grad_norm": 2.0625, + "learning_rate": 2.42e-07, + "loss": 0.0412, + "step": 9880 + }, + { + "epoch": 8.518574351243673, + "grad_norm": 2.265625, + "learning_rate": 2.2200000000000003e-07, + "loss": 0.043, + "step": 9890 + }, + { + "epoch": 8.527188543124797, + "grad_norm": 2.703125, + "learning_rate": 2.02e-07, + "loss": 0.0412, + "step": 9900 + }, + { + "epoch": 8.535802735005923, + "grad_norm": 2.4375, + "learning_rate": 1.8200000000000002e-07, + "loss": 0.0431, + "step": 9910 + }, + { + "epoch": 8.544416926887047, + "grad_norm": 2.734375, + "learning_rate": 1.62e-07, + "loss": 0.0413, + "step": 9920 + }, + { + "epoch": 8.553031118768171, + "grad_norm": 2.296875, + "learning_rate": 1.4200000000000003e-07, + "loss": 0.0458, + "step": 9930 + }, + { + "epoch": 8.561645310649295, + "grad_norm": 2.359375, + "learning_rate": 1.22e-07, + "loss": 0.0445, + "step": 9940 + }, + { + "epoch": 8.57025950253042, + "grad_norm": 2.46875, + "learning_rate": 1.0200000000000001e-07, + "loss": 0.0454, + "step": 9950 + }, + { + "epoch": 8.578873694411543, + "grad_norm": 2.546875, + "learning_rate": 8.200000000000002e-08, + "loss": 0.0428, + "step": 9960 + }, + { + "epoch": 8.587487886292667, + "grad_norm": 2.09375, + "learning_rate": 6.2e-08, + "loss": 0.0419, + "step": 9970 + }, + { + "epoch": 8.596102078173791, + "grad_norm": 1.8984375, + "learning_rate": 4.2e-08, + "loss": 0.044, + "step": 9980 + }, + { + "epoch": 8.604716270054915, + "grad_norm": 2.078125, + "learning_rate": 2.2000000000000002e-08, + "loss": 0.0411, + "step": 9990 + }, + { + "epoch": 8.61333046193604, + "grad_norm": 2.5625, + "learning_rate": 2e-09, + "loss": 0.0433, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.840766623509979e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}