{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2001067235859125,
  "eval_steps": 500,
  "global_step": 2250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008893632159373888,
      "grad_norm": 68.5,
      "learning_rate": 2.7e-06,
      "loss": 8.8776,
      "step": 10
    },
    {
      "epoch": 0.0017787264318747777,
      "grad_norm": 39.0,
      "learning_rate": 5.7000000000000005e-06,
      "loss": 7.711,
      "step": 20
    },
    {
      "epoch": 0.0026680896478121665,
      "grad_norm": 29.375,
      "learning_rate": 8.7e-06,
      "loss": 5.7038,
      "step": 30
    },
    {
      "epoch": 0.0035574528637495554,
      "grad_norm": 24.5,
      "learning_rate": 1.1700000000000001e-05,
      "loss": 4.746,
      "step": 40
    },
    {
      "epoch": 0.004446816079686944,
      "grad_norm": 22.625,
      "learning_rate": 1.47e-05,
      "loss": 4.1094,
      "step": 50
    },
    {
      "epoch": 0.005336179295624333,
      "grad_norm": 21.625,
      "learning_rate": 1.77e-05,
      "loss": 4.0746,
      "step": 60
    },
    {
      "epoch": 0.0062255425115617215,
      "grad_norm": 25.25,
      "learning_rate": 2.07e-05,
      "loss": 3.8396,
      "step": 70
    },
    {
      "epoch": 0.007114905727499111,
      "grad_norm": 20.875,
      "learning_rate": 2.37e-05,
      "loss": 3.8629,
      "step": 80
    },
    {
      "epoch": 0.0080042689434365,
      "grad_norm": 22.875,
      "learning_rate": 2.6700000000000002e-05,
      "loss": 3.9097,
      "step": 90
    },
    {
      "epoch": 0.008893632159373888,
      "grad_norm": 19.375,
      "learning_rate": 2.97e-05,
      "loss": 3.763,
      "step": 100
    },
    {
      "epoch": 0.009782995375311278,
      "grad_norm": 19.625,
      "learning_rate": 2.9874418604651165e-05,
      "loss": 4.1501,
      "step": 110
    },
    {
      "epoch": 0.010672358591248666,
      "grad_norm": 19.0,
      "learning_rate": 2.9734883720930235e-05,
      "loss": 3.9038,
      "step": 120
    },
    {
      "epoch": 0.011561721807186055,
      "grad_norm": 16.625,
      "learning_rate": 2.9595348837209305e-05,
      "loss": 4.0583,
      "step": 130
    },
    {
      "epoch": 0.012451085023123443,
      "grad_norm": 16.375,
      "learning_rate": 2.9455813953488376e-05,
      "loss": 3.5958,
      "step": 140
    },
    {
      "epoch": 0.013340448239060833,
      "grad_norm": 15.375,
      "learning_rate": 2.9316279069767443e-05,
      "loss": 3.9075,
      "step": 150
    },
    {
      "epoch": 0.014229811454998222,
      "grad_norm": 15.625,
      "learning_rate": 2.9176744186046513e-05,
      "loss": 3.9495,
      "step": 160
    },
    {
      "epoch": 0.01511917467093561,
      "grad_norm": 18.125,
      "learning_rate": 2.9037209302325583e-05,
      "loss": 3.3247,
      "step": 170
    },
    {
      "epoch": 0.016008537886873,
      "grad_norm": 18.75,
      "learning_rate": 2.889767441860465e-05,
      "loss": 3.6863,
      "step": 180
    },
    {
      "epoch": 0.01689790110281039,
      "grad_norm": 15.625,
      "learning_rate": 2.875813953488372e-05,
      "loss": 3.3097,
      "step": 190
    },
    {
      "epoch": 0.017787264318747775,
      "grad_norm": 14.3125,
      "learning_rate": 2.861860465116279e-05,
      "loss": 3.48,
      "step": 200
    },
    {
      "epoch": 0.018676627534685165,
      "grad_norm": 17.875,
      "learning_rate": 2.847906976744186e-05,
      "loss": 3.6216,
      "step": 210
    },
    {
      "epoch": 0.019565990750622556,
      "grad_norm": 15.4375,
      "learning_rate": 2.833953488372093e-05,
      "loss": 3.4751,
      "step": 220
    },
    {
      "epoch": 0.020455353966559942,
      "grad_norm": 14.75,
      "learning_rate": 2.8199999999999998e-05,
      "loss": 3.9,
      "step": 230
    },
    {
      "epoch": 0.021344717182497332,
      "grad_norm": 14.8125,
      "learning_rate": 2.8060465116279068e-05,
      "loss": 3.7454,
      "step": 240
    },
    {
      "epoch": 0.02223408039843472,
      "grad_norm": 15.6875,
      "learning_rate": 2.7920930232558138e-05,
      "loss": 3.5053,
      "step": 250
    },
    {
      "epoch": 0.02312344361437211,
      "grad_norm": 14.5,
      "learning_rate": 2.778139534883721e-05,
      "loss": 3.3656,
      "step": 260
    },
    {
      "epoch": 0.0240128068303095,
      "grad_norm": 13.6875,
      "learning_rate": 2.764186046511628e-05,
      "loss": 3.769,
      "step": 270
    },
    {
      "epoch": 0.024902170046246886,
      "grad_norm": 15.0625,
      "learning_rate": 2.750232558139535e-05,
      "loss": 3.5721,
      "step": 280
    },
    {
      "epoch": 0.025791533262184276,
      "grad_norm": 14.6875,
      "learning_rate": 2.736279069767442e-05,
      "loss": 3.6104,
      "step": 290
    },
    {
      "epoch": 0.026680896478121666,
      "grad_norm": 13.875,
      "learning_rate": 2.722325581395349e-05,
      "loss": 3.5904,
      "step": 300
    },
    {
      "epoch": 0.027570259694059053,
      "grad_norm": 13.8125,
      "learning_rate": 2.7083720930232556e-05,
      "loss": 3.5633,
      "step": 310
    },
    {
      "epoch": 0.028459622909996443,
      "grad_norm": 13.5625,
      "learning_rate": 2.6944186046511626e-05,
      "loss": 3.3366,
      "step": 320
    },
    {
      "epoch": 0.02934898612593383,
      "grad_norm": 14.3125,
      "learning_rate": 2.6804651162790697e-05,
      "loss": 3.6519,
      "step": 330
    },
    {
      "epoch": 0.03023834934187122,
      "grad_norm": 14.4375,
      "learning_rate": 2.6665116279069767e-05,
      "loss": 3.4482,
      "step": 340
    },
    {
      "epoch": 0.03112771255780861,
      "grad_norm": 13.9375,
      "learning_rate": 2.6525581395348837e-05,
      "loss": 3.3554,
      "step": 350
    },
    {
      "epoch": 0.032017075773746,
      "grad_norm": 14.5625,
      "learning_rate": 2.6386046511627907e-05,
      "loss": 3.4008,
      "step": 360
    },
    {
      "epoch": 0.03290643898968339,
      "grad_norm": 14.375,
      "learning_rate": 2.6246511627906978e-05,
      "loss": 3.3279,
      "step": 370
    },
    {
      "epoch": 0.03379580220562078,
      "grad_norm": 13.9375,
      "learning_rate": 2.6106976744186048e-05,
      "loss": 3.487,
      "step": 380
    },
    {
      "epoch": 0.03468516542155817,
      "grad_norm": 14.625,
      "learning_rate": 2.5967441860465115e-05,
      "loss": 3.2542,
      "step": 390
    },
    {
      "epoch": 0.03557452863749555,
      "grad_norm": 13.1875,
      "learning_rate": 2.5827906976744185e-05,
      "loss": 3.3328,
      "step": 400
    },
    {
      "epoch": 0.03646389185343294,
      "grad_norm": 13.25,
      "learning_rate": 2.5688372093023255e-05,
      "loss": 3.5205,
      "step": 410
    },
    {
      "epoch": 0.03735325506937033,
      "grad_norm": 11.9375,
      "learning_rate": 2.5548837209302325e-05,
      "loss": 3.4151,
      "step": 420
    },
    {
      "epoch": 0.03824261828530772,
      "grad_norm": 13.625,
      "learning_rate": 2.5409302325581396e-05,
      "loss": 3.1404,
      "step": 430
    },
    {
      "epoch": 0.03913198150124511,
      "grad_norm": 14.0625,
      "learning_rate": 2.5269767441860466e-05,
      "loss": 3.5277,
      "step": 440
    },
    {
      "epoch": 0.040021344717182494,
      "grad_norm": 13.1875,
      "learning_rate": 2.5130232558139536e-05,
      "loss": 3.1584,
      "step": 450
    },
    {
      "epoch": 0.040910707933119884,
      "grad_norm": 13.4375,
      "learning_rate": 2.4990697674418606e-05,
      "loss": 3.4669,
      "step": 460
    },
    {
      "epoch": 0.041800071149057275,
      "grad_norm": 14.5625,
      "learning_rate": 2.4851162790697673e-05,
      "loss": 3.2653,
      "step": 470
    },
    {
      "epoch": 0.042689434364994665,
      "grad_norm": 13.9375,
      "learning_rate": 2.4711627906976743e-05,
      "loss": 3.4219,
      "step": 480
    },
    {
      "epoch": 0.043578797580932055,
      "grad_norm": 12.875,
      "learning_rate": 2.4572093023255814e-05,
      "loss": 3.508,
      "step": 490
    },
    {
      "epoch": 0.04446816079686944,
      "grad_norm": 14.0625,
      "learning_rate": 2.4432558139534884e-05,
      "loss": 3.3343,
      "step": 500
    },
    {
      "epoch": 0.04535752401280683,
      "grad_norm": 13.75,
      "learning_rate": 2.4293023255813954e-05,
      "loss": 3.5054,
      "step": 510
    },
    {
      "epoch": 0.04624688722874422,
      "grad_norm": 15.25,
      "learning_rate": 2.4153488372093024e-05,
      "loss": 3.0298,
      "step": 520
    },
    {
      "epoch": 0.04713625044468161,
      "grad_norm": 13.75,
      "learning_rate": 2.4013953488372095e-05,
      "loss": 3.4306,
      "step": 530
    },
    {
      "epoch": 0.048025613660619,
      "grad_norm": 14.25,
      "learning_rate": 2.3874418604651165e-05,
      "loss": 3.1824,
      "step": 540
    },
    {
      "epoch": 0.04891497687655639,
      "grad_norm": 12.3125,
      "learning_rate": 2.373488372093023e-05,
      "loss": 3.211,
      "step": 550
    },
    {
      "epoch": 0.04980434009249377,
      "grad_norm": 13.75,
      "learning_rate": 2.3595348837209302e-05,
      "loss": 3.091,
      "step": 560
    },
    {
      "epoch": 0.05069370330843116,
      "grad_norm": 13.75,
      "learning_rate": 2.3455813953488372e-05,
      "loss": 3.234,
      "step": 570
    },
    {
      "epoch": 0.05158306652436855,
      "grad_norm": 13.75,
      "learning_rate": 2.3316279069767442e-05,
      "loss": 3.1404,
      "step": 580
    },
    {
      "epoch": 0.05247242974030594,
      "grad_norm": 13.25,
      "learning_rate": 2.3176744186046513e-05,
      "loss": 3.2191,
      "step": 590
    },
    {
      "epoch": 0.05336179295624333,
      "grad_norm": 12.625,
      "learning_rate": 2.3037209302325583e-05,
      "loss": 3.0968,
      "step": 600
    },
    {
      "epoch": 0.054251156172180716,
      "grad_norm": 13.5625,
      "learning_rate": 2.2897674418604653e-05,
      "loss": 3.1108,
      "step": 610
    },
    {
      "epoch": 0.055140519388118106,
      "grad_norm": 12.75,
      "learning_rate": 2.2758139534883723e-05,
      "loss": 3.028,
      "step": 620
    },
    {
      "epoch": 0.056029882604055496,
      "grad_norm": 12.375,
      "learning_rate": 2.261860465116279e-05,
      "loss": 3.1469,
      "step": 630
    },
    {
      "epoch": 0.056919245819992886,
      "grad_norm": 14.8125,
      "learning_rate": 2.247906976744186e-05,
      "loss": 3.1258,
      "step": 640
    },
    {
      "epoch": 0.057808609035930276,
      "grad_norm": 12.5625,
      "learning_rate": 2.233953488372093e-05,
      "loss": 3.1147,
      "step": 650
    },
    {
      "epoch": 0.05869797225186766,
      "grad_norm": 13.1875,
      "learning_rate": 2.22e-05,
      "loss": 3.0447,
      "step": 660
    },
    {
      "epoch": 0.05958733546780505,
      "grad_norm": 15.3125,
      "learning_rate": 2.206046511627907e-05,
      "loss": 3.1177,
      "step": 670
    },
    {
      "epoch": 0.06047669868374244,
      "grad_norm": 13.6875,
      "learning_rate": 2.192093023255814e-05,
      "loss": 3.2542,
      "step": 680
    },
    {
      "epoch": 0.06136606189967983,
      "grad_norm": 13.1875,
      "learning_rate": 2.178139534883721e-05,
      "loss": 3.188,
      "step": 690
    },
    {
      "epoch": 0.06225542511561722,
      "grad_norm": 13.625,
      "learning_rate": 2.1641860465116282e-05,
      "loss": 3.2298,
      "step": 700
    },
    {
      "epoch": 0.06314478833155461,
      "grad_norm": 14.0625,
      "learning_rate": 2.150232558139535e-05,
      "loss": 3.1127,
      "step": 710
    },
    {
      "epoch": 0.064034151547492,
      "grad_norm": 15.1875,
      "learning_rate": 2.136279069767442e-05,
      "loss": 3.1468,
      "step": 720
    },
    {
      "epoch": 0.06492351476342939,
      "grad_norm": 12.625,
      "learning_rate": 2.122325581395349e-05,
      "loss": 3.2763,
      "step": 730
    },
    {
      "epoch": 0.06581287797936677,
      "grad_norm": 13.875,
      "learning_rate": 2.108372093023256e-05,
      "loss": 3.051,
      "step": 740
    },
    {
      "epoch": 0.06670224119530416,
      "grad_norm": 13.9375,
      "learning_rate": 2.094418604651163e-05,
      "loss": 3.1937,
      "step": 750
    },
    {
      "epoch": 0.06759160441124155,
      "grad_norm": 13.3125,
      "learning_rate": 2.08046511627907e-05,
      "loss": 3.0506,
      "step": 760
    },
    {
      "epoch": 0.06848096762717894,
      "grad_norm": 12.625,
      "learning_rate": 2.066511627906977e-05,
      "loss": 2.975,
      "step": 770
    },
    {
      "epoch": 0.06937033084311633,
      "grad_norm": 12.75,
      "learning_rate": 2.052558139534884e-05,
      "loss": 2.9174,
      "step": 780
    },
    {
      "epoch": 0.07025969405905372,
      "grad_norm": 12.875,
      "learning_rate": 2.0386046511627907e-05,
      "loss": 3.0532,
      "step": 790
    },
    {
      "epoch": 0.0711490572749911,
      "grad_norm": 12.4375,
      "learning_rate": 2.0246511627906977e-05,
      "loss": 3.2092,
      "step": 800
    },
    {
      "epoch": 0.0720384204909285,
      "grad_norm": 12.625,
      "learning_rate": 2.0106976744186048e-05,
      "loss": 2.9702,
      "step": 810
    },
    {
      "epoch": 0.07292778370686588,
      "grad_norm": 14.0625,
      "learning_rate": 1.9967441860465118e-05,
      "loss": 3.0595,
      "step": 820
    },
    {
      "epoch": 0.07381714692280328,
      "grad_norm": 14.5,
      "learning_rate": 1.9827906976744188e-05,
      "loss": 3.0102,
      "step": 830
    },
    {
      "epoch": 0.07470651013874066,
      "grad_norm": 13.25,
      "learning_rate": 1.968837209302326e-05,
      "loss": 3.0725,
      "step": 840
    },
    {
      "epoch": 0.07559587335467804,
      "grad_norm": 13.6875,
      "learning_rate": 1.954883720930233e-05,
      "loss": 3.0679,
      "step": 850
    },
    {
      "epoch": 0.07648523657061544,
      "grad_norm": 13.1875,
      "learning_rate": 1.94093023255814e-05,
      "loss": 3.0764,
      "step": 860
    },
    {
      "epoch": 0.07737459978655283,
      "grad_norm": 14.5,
      "learning_rate": 1.9269767441860466e-05,
      "loss": 2.768,
      "step": 870
    },
    {
      "epoch": 0.07826396300249022,
      "grad_norm": 14.875,
      "learning_rate": 1.9130232558139536e-05,
      "loss": 3.2016,
      "step": 880
    },
    {
      "epoch": 0.0791533262184276,
      "grad_norm": 12.875,
      "learning_rate": 1.8990697674418606e-05,
      "loss": 3.0906,
      "step": 890
    },
    {
      "epoch": 0.08004268943436499,
      "grad_norm": 13.0,
      "learning_rate": 1.8851162790697673e-05,
      "loss": 3.1441,
      "step": 900
    },
    {
      "epoch": 0.08093205265030239,
      "grad_norm": 13.25,
      "learning_rate": 1.8711627906976743e-05,
      "loss": 2.9791,
      "step": 910
    },
    {
      "epoch": 0.08182141586623977,
      "grad_norm": 14.0625,
      "learning_rate": 1.8572093023255814e-05,
      "loss": 2.8393,
      "step": 920
    },
    {
      "epoch": 0.08271077908217717,
      "grad_norm": 13.1875,
      "learning_rate": 1.8432558139534884e-05,
      "loss": 2.9332,
      "step": 930
    },
    {
      "epoch": 0.08360014229811455,
      "grad_norm": 13.0625,
      "learning_rate": 1.8293023255813954e-05,
      "loss": 2.9787,
      "step": 940
    },
    {
      "epoch": 0.08448950551405193,
      "grad_norm": 15.375,
      "learning_rate": 1.815348837209302e-05,
      "loss": 2.8229,
      "step": 950
    },
    {
      "epoch": 0.08537886872998933,
      "grad_norm": 13.5625,
      "learning_rate": 1.801395348837209e-05,
      "loss": 2.8112,
      "step": 960
    },
    {
      "epoch": 0.08626823194592671,
      "grad_norm": 16.75,
      "learning_rate": 1.787441860465116e-05,
      "loss": 3.0131,
      "step": 970
    },
    {
      "epoch": 0.08715759516186411,
      "grad_norm": 13.9375,
      "learning_rate": 1.773488372093023e-05,
      "loss": 2.9496,
      "step": 980
    },
    {
      "epoch": 0.08804695837780149,
      "grad_norm": 15.0625,
      "learning_rate": 1.7595348837209302e-05,
      "loss": 3.0641,
      "step": 990
    },
    {
      "epoch": 0.08893632159373888,
      "grad_norm": 13.0,
      "learning_rate": 1.7455813953488372e-05,
      "loss": 2.8714,
      "step": 1000
    },
    {
      "epoch": 0.08982568480967627,
      "grad_norm": 13.8125,
      "learning_rate": 1.7316279069767442e-05,
      "loss": 2.7792,
      "step": 1010
    },
    {
      "epoch": 0.09071504802561366,
      "grad_norm": 13.875,
      "learning_rate": 1.7176744186046512e-05,
      "loss": 2.9472,
      "step": 1020
    },
    {
      "epoch": 0.09160441124155105,
      "grad_norm": 14.0,
      "learning_rate": 1.703720930232558e-05,
      "loss": 2.5662,
      "step": 1030
    },
    {
      "epoch": 0.09249377445748844,
      "grad_norm": 13.9375,
      "learning_rate": 1.689767441860465e-05,
      "loss": 2.82,
      "step": 1040
    },
    {
      "epoch": 0.09338313767342583,
      "grad_norm": 13.875,
      "learning_rate": 1.675813953488372e-05,
      "loss": 2.8245,
      "step": 1050
    },
    {
      "epoch": 0.09427250088936322,
      "grad_norm": 11.8125,
      "learning_rate": 1.661860465116279e-05,
      "loss": 2.8141,
      "step": 1060
    },
    {
      "epoch": 0.0951618641053006,
      "grad_norm": 13.1875,
      "learning_rate": 1.647906976744186e-05,
      "loss": 2.8975,
      "step": 1070
    },
    {
      "epoch": 0.096051227321238,
      "grad_norm": 14.625,
      "learning_rate": 1.633953488372093e-05,
      "loss": 2.7996,
      "step": 1080
    },
    {
      "epoch": 0.09694059053717538,
      "grad_norm": 12.625,
      "learning_rate": 1.62e-05,
      "loss": 2.7679,
      "step": 1090
    },
    {
      "epoch": 0.09782995375311278,
      "grad_norm": 14.0625,
      "learning_rate": 1.606046511627907e-05,
      "loss": 3.0057,
      "step": 1100
    },
    {
      "epoch": 0.09871931696905016,
      "grad_norm": 13.6875,
      "learning_rate": 1.5920930232558138e-05,
      "loss": 2.5742,
      "step": 1110
    },
    {
      "epoch": 0.09960868018498754,
      "grad_norm": 15.3125,
      "learning_rate": 1.5781395348837208e-05,
      "loss": 2.7903,
      "step": 1120
    },
    {
      "epoch": 0.10049804340092494,
      "grad_norm": 14.8125,
      "learning_rate": 1.564186046511628e-05,
      "loss": 2.5664,
      "step": 1130
    },
    {
      "epoch": 0.10138740661686232,
      "grad_norm": 13.8125,
      "learning_rate": 1.550232558139535e-05,
      "loss": 2.9392,
      "step": 1140
    },
    {
      "epoch": 0.10227676983279972,
      "grad_norm": 13.9375,
      "learning_rate": 1.536279069767442e-05,
      "loss": 2.7105,
      "step": 1150
    },
    {
      "epoch": 0.1031661330487371,
      "grad_norm": 14.5,
      "learning_rate": 1.5223255813953489e-05,
      "loss": 2.9472,
      "step": 1160
    },
    {
      "epoch": 0.10405549626467449,
      "grad_norm": 13.4375,
      "learning_rate": 1.508372093023256e-05,
      "loss": 2.7698,
      "step": 1170
    },
    {
      "epoch": 0.10494485948061189,
      "grad_norm": 13.125,
      "learning_rate": 1.4944186046511628e-05,
      "loss": 2.8998,
      "step": 1180
    },
    {
      "epoch": 0.10583422269654927,
      "grad_norm": 14.125,
      "learning_rate": 1.4804651162790698e-05,
      "loss": 2.7952,
      "step": 1190
    },
    {
      "epoch": 0.10672358591248667,
      "grad_norm": 14.875,
      "learning_rate": 1.4665116279069768e-05,
      "loss": 2.7689,
      "step": 1200
    },
    {
      "epoch": 0.10761294912842405,
      "grad_norm": 13.375,
      "learning_rate": 1.4525581395348837e-05,
      "loss": 3.0368,
      "step": 1210
    },
    {
      "epoch": 0.10850231234436143,
      "grad_norm": 14.5,
      "learning_rate": 1.4386046511627907e-05,
      "loss": 2.8113,
      "step": 1220
    },
    {
      "epoch": 0.10939167556029883,
      "grad_norm": 13.25,
      "learning_rate": 1.4246511627906977e-05,
      "loss": 2.5883,
      "step": 1230
    },
    {
      "epoch": 0.11028103877623621,
      "grad_norm": 14.25,
      "learning_rate": 1.4106976744186048e-05,
      "loss": 2.9207,
      "step": 1240
    },
    {
      "epoch": 0.11117040199217361,
      "grad_norm": 12.9375,
      "learning_rate": 1.3967441860465116e-05,
      "loss": 2.8662,
      "step": 1250
    },
    {
      "epoch": 0.11205976520811099,
      "grad_norm": 14.25,
      "learning_rate": 1.3827906976744186e-05,
      "loss": 2.8439,
      "step": 1260
    },
    {
      "epoch": 0.11294912842404838,
      "grad_norm": 13.1875,
      "learning_rate": 1.3688372093023257e-05,
      "loss": 2.8322,
      "step": 1270
    },
    {
      "epoch": 0.11383849163998577,
      "grad_norm": 13.6875,
      "learning_rate": 1.3548837209302327e-05,
      "loss": 2.8627,
      "step": 1280
    },
    {
      "epoch": 0.11472785485592316,
      "grad_norm": 12.8125,
      "learning_rate": 1.3409302325581395e-05,
      "loss": 2.733,
      "step": 1290
    },
    {
      "epoch": 0.11561721807186055,
      "grad_norm": 12.75,
      "learning_rate": 1.3269767441860466e-05,
      "loss": 2.7608,
      "step": 1300
    },
    {
      "epoch": 0.11650658128779794,
      "grad_norm": 14.4375,
      "learning_rate": 1.3130232558139536e-05,
      "loss": 2.7633,
      "step": 1310
    },
    {
      "epoch": 0.11739594450373532,
      "grad_norm": 13.875,
      "learning_rate": 1.2990697674418606e-05,
      "loss": 2.682,
      "step": 1320
    },
    {
      "epoch": 0.11828530771967272,
      "grad_norm": 14.4375,
      "learning_rate": 1.2851162790697675e-05,
      "loss": 2.7205,
      "step": 1330
    },
    {
      "epoch": 0.1191746709356101,
      "grad_norm": 12.6875,
      "learning_rate": 1.2711627906976745e-05,
      "loss": 2.7408,
      "step": 1340
    },
    {
      "epoch": 0.1200640341515475,
      "grad_norm": 13.5,
      "learning_rate": 1.2572093023255815e-05,
      "loss": 3.0955,
      "step": 1350
    },
    {
      "epoch": 0.12095339736748488,
      "grad_norm": 13.8125,
      "learning_rate": 1.2432558139534885e-05,
      "loss": 2.8928,
      "step": 1360
    },
    {
      "epoch": 0.12184276058342226,
      "grad_norm": 12.5625,
      "learning_rate": 1.2293023255813954e-05,
      "loss": 2.7429,
      "step": 1370
    },
    {
      "epoch": 0.12273212379935966,
      "grad_norm": 14.3125,
      "learning_rate": 1.2153488372093024e-05,
      "loss": 2.6795,
      "step": 1380
    },
    {
      "epoch": 0.12362148701529704,
      "grad_norm": 13.125,
      "learning_rate": 1.2013953488372094e-05,
      "loss": 2.6802,
      "step": 1390
    },
    {
      "epoch": 0.12451085023123444,
      "grad_norm": 13.8125,
      "learning_rate": 1.1874418604651165e-05,
      "loss": 2.6818,
      "step": 1400
    },
    {
      "epoch": 0.12540021344717184,
      "grad_norm": 13.875,
      "learning_rate": 1.1734883720930233e-05,
      "loss": 2.5968,
      "step": 1410
    },
    {
      "epoch": 0.12628957666310922,
      "grad_norm": 13.75,
      "learning_rate": 1.1595348837209303e-05,
      "loss": 2.7026,
      "step": 1420
    },
    {
      "epoch": 0.1271789398790466,
      "grad_norm": 13.5625,
      "learning_rate": 1.1455813953488372e-05,
      "loss": 2.8092,
      "step": 1430
    },
    {
      "epoch": 0.128068303094984,
      "grad_norm": 13.9375,
      "learning_rate": 1.1316279069767442e-05,
      "loss": 2.6957,
      "step": 1440
    },
    {
      "epoch": 0.12895766631092137,
      "grad_norm": 14.0,
      "learning_rate": 1.117674418604651e-05,
      "loss": 2.8901,
      "step": 1450
    },
    {
      "epoch": 0.12984702952685878,
      "grad_norm": 14.3125,
      "learning_rate": 1.1037209302325581e-05,
      "loss": 2.6284,
      "step": 1460
    },
    {
      "epoch": 0.13073639274279616,
      "grad_norm": 13.875,
      "learning_rate": 1.0897674418604651e-05,
      "loss": 2.9493,
      "step": 1470
    },
    {
      "epoch": 0.13162575595873355,
      "grad_norm": 13.5625,
      "learning_rate": 1.0758139534883721e-05,
      "loss": 2.6665,
      "step": 1480
    },
    {
      "epoch": 0.13251511917467093,
      "grad_norm": 14.125,
      "learning_rate": 1.061860465116279e-05,
      "loss": 2.8124,
      "step": 1490
    },
    {
      "epoch": 0.13340448239060831,
      "grad_norm": 13.8125,
      "learning_rate": 1.047906976744186e-05,
      "loss": 2.8094,
      "step": 1500
    },
    {
      "epoch": 0.13429384560654573,
      "grad_norm": 14.3125,
      "learning_rate": 1.033953488372093e-05,
      "loss": 2.6246,
      "step": 1510
    },
    {
      "epoch": 0.1351832088224831,
      "grad_norm": 14.5625,
      "learning_rate": 1.02e-05,
      "loss": 2.8486,
      "step": 1520
    },
    {
      "epoch": 0.1360725720384205,
      "grad_norm": 13.625,
      "learning_rate": 1.0060465116279069e-05,
      "loss": 2.7712,
      "step": 1530
    },
    {
      "epoch": 0.13696193525435787,
      "grad_norm": 14.5625,
      "learning_rate": 9.92093023255814e-06,
      "loss": 2.7667,
      "step": 1540
    },
    {
      "epoch": 0.13785129847029526,
      "grad_norm": 14.375,
      "learning_rate": 9.78139534883721e-06,
      "loss": 2.6918,
      "step": 1550
    },
    {
      "epoch": 0.13874066168623267,
      "grad_norm": 14.1875,
      "learning_rate": 9.64186046511628e-06,
      "loss": 2.872,
      "step": 1560
    },
    {
      "epoch": 0.13963002490217005,
      "grad_norm": 14.375,
      "learning_rate": 9.502325581395348e-06,
      "loss": 2.9586,
      "step": 1570
    },
    {
      "epoch": 0.14051938811810744,
      "grad_norm": 13.8125,
      "learning_rate": 9.362790697674419e-06,
      "loss": 2.5803,
      "step": 1580
    },
    {
      "epoch": 0.14140875133404482,
      "grad_norm": 13.8125,
      "learning_rate": 9.223255813953489e-06,
      "loss": 2.72,
      "step": 1590
    },
    {
      "epoch": 0.1422981145499822,
      "grad_norm": 13.4375,
      "learning_rate": 9.083720930232559e-06,
      "loss": 2.7268,
      "step": 1600
    },
    {
      "epoch": 0.1431874777659196,
      "grad_norm": 14.0625,
      "learning_rate": 8.944186046511628e-06,
      "loss": 2.7176,
      "step": 1610
    },
    {
      "epoch": 0.144076840981857,
      "grad_norm": 15.125,
      "learning_rate": 8.804651162790698e-06,
      "loss": 2.6748,
      "step": 1620
    },
    {
      "epoch": 0.14496620419779438,
      "grad_norm": 15.25,
      "learning_rate": 8.665116279069768e-06,
      "loss": 2.7349,
      "step": 1630
    },
    {
      "epoch": 0.14585556741373176,
      "grad_norm": 15.0625,
      "learning_rate": 8.525581395348838e-06,
      "loss": 2.729,
      "step": 1640
    },
    {
      "epoch": 0.14674493062966915,
      "grad_norm": 14.5625,
      "learning_rate": 8.386046511627907e-06,
      "loss": 2.6542,
      "step": 1650
    },
    {
      "epoch": 0.14763429384560656,
      "grad_norm": 14.625,
      "learning_rate": 8.246511627906977e-06,
      "loss": 2.6102,
      "step": 1660
    },
    {
      "epoch": 0.14852365706154394,
      "grad_norm": 13.8125,
      "learning_rate": 8.106976744186047e-06,
      "loss": 2.7355,
      "step": 1670
    },
    {
      "epoch": 0.14941302027748132,
      "grad_norm": 14.1875,
      "learning_rate": 7.967441860465118e-06,
      "loss": 2.7689,
      "step": 1680
    },
    {
      "epoch": 0.1503023834934187,
      "grad_norm": 14.3125,
      "learning_rate": 7.827906976744186e-06,
      "loss": 2.7373,
      "step": 1690
    },
    {
      "epoch": 0.1511917467093561,
      "grad_norm": 15.25,
      "learning_rate": 7.688372093023256e-06,
      "loss": 2.7359,
      "step": 1700
    },
    {
      "epoch": 0.1520811099252935,
      "grad_norm": 15.0,
      "learning_rate": 7.548837209302326e-06,
      "loss": 2.6088,
      "step": 1710
    },
    {
      "epoch": 0.15297047314123088,
      "grad_norm": 14.4375,
      "learning_rate": 7.409302325581395e-06,
      "loss": 2.7727,
      "step": 1720
    },
    {
      "epoch": 0.15385983635716827,
      "grad_norm": 13.1875,
      "learning_rate": 7.269767441860465e-06,
      "loss": 2.8457,
      "step": 1730
    },
    {
      "epoch": 0.15474919957310565,
      "grad_norm": 14.0625,
      "learning_rate": 7.130232558139535e-06,
      "loss": 2.6671,
      "step": 1740
    },
    {
      "epoch": 0.15563856278904303,
      "grad_norm": 14.0,
      "learning_rate": 6.990697674418605e-06,
      "loss": 2.593,
      "step": 1750
    },
    {
      "epoch": 0.15652792600498044,
      "grad_norm": 13.6875,
      "learning_rate": 6.851162790697674e-06,
      "loss": 2.6174,
      "step": 1760
    },
    {
      "epoch": 0.15741728922091783,
      "grad_norm": 14.375,
      "learning_rate": 6.711627906976745e-06,
      "loss": 2.6143,
      "step": 1770
    },
    {
      "epoch": 0.1583066524368552,
      "grad_norm": 12.875,
      "learning_rate": 6.572093023255814e-06,
      "loss": 2.7958,
      "step": 1780
    },
    {
      "epoch": 0.1591960156527926,
      "grad_norm": 13.875,
      "learning_rate": 6.432558139534884e-06,
      "loss": 2.6567,
      "step": 1790
    },
    {
      "epoch": 0.16008537886872998,
      "grad_norm": 14.625,
      "learning_rate": 6.293023255813954e-06,
      "loss": 2.5427,
      "step": 1800
    },
    {
      "epoch": 0.1609747420846674,
      "grad_norm": 14.1875,
      "learning_rate": 6.153488372093024e-06,
      "loss": 2.6013,
      "step": 1810
    },
    {
      "epoch": 0.16186410530060477,
      "grad_norm": 14.8125,
      "learning_rate": 6.013953488372093e-06,
      "loss": 2.6624,
      "step": 1820
    },
    {
      "epoch": 0.16275346851654215,
      "grad_norm": 14.625,
      "learning_rate": 5.8744186046511635e-06,
      "loss": 2.6861,
      "step": 1830
    },
    {
      "epoch": 0.16364283173247954,
      "grad_norm": 12.875,
      "learning_rate": 5.734883720930233e-06,
      "loss": 2.7337,
      "step": 1840
    },
    {
      "epoch": 0.16453219494841692,
      "grad_norm": 13.25,
      "learning_rate": 5.595348837209303e-06,
      "loss": 2.532,
      "step": 1850
    },
    {
      "epoch": 0.16542155816435433,
      "grad_norm": 14.0625,
      "learning_rate": 5.4558139534883726e-06,
      "loss": 2.5045,
      "step": 1860
    },
    {
      "epoch": 0.16631092138029172,
      "grad_norm": 14.5625,
      "learning_rate": 5.316279069767443e-06,
      "loss": 2.8413,
      "step": 1870
    },
    {
      "epoch": 0.1672002845962291,
      "grad_norm": 14.8125,
      "learning_rate": 5.176744186046511e-06,
      "loss": 2.6274,
      "step": 1880
    },
    {
      "epoch": 0.16808964781216648,
      "grad_norm": 13.5625,
      "learning_rate": 5.0372093023255816e-06,
      "loss": 2.5649,
      "step": 1890
    },
    {
      "epoch": 0.16897901102810386,
      "grad_norm": 15.0625,
      "learning_rate": 4.897674418604651e-06,
      "loss": 2.7143,
      "step": 1900
    },
    {
      "epoch": 0.16986837424404128,
      "grad_norm": 15.125,
      "learning_rate": 4.758139534883721e-06,
      "loss": 2.642,
      "step": 1910
    },
    {
      "epoch": 0.17075773745997866,
      "grad_norm": 12.875,
      "learning_rate": 4.618604651162791e-06,
      "loss": 2.8059,
      "step": 1920
    },
    {
      "epoch": 0.17164710067591604,
      "grad_norm": 13.8125,
      "learning_rate": 4.479069767441861e-06,
      "loss": 2.6971,
      "step": 1930
    },
    {
      "epoch": 0.17253646389185343,
      "grad_norm": 13.6875,
      "learning_rate": 4.33953488372093e-06,
      "loss": 2.6398,
      "step": 1940
    },
    {
      "epoch": 0.1734258271077908,
      "grad_norm": 16.375,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 2.9306,
      "step": 1950
    },
    {
      "epoch": 0.17431519032372822,
      "grad_norm": 13.8125,
      "learning_rate": 4.06046511627907e-06,
      "loss": 2.7713,
      "step": 1960
    },
    {
      "epoch": 0.1752045535396656,
      "grad_norm": 13.125,
      "learning_rate": 3.92093023255814e-06,
      "loss": 2.7845,
      "step": 1970
    },
    {
      "epoch": 0.17609391675560299,
      "grad_norm": 13.5,
      "learning_rate": 3.7813953488372095e-06,
      "loss": 2.5285,
      "step": 1980
    },
    {
      "epoch": 0.17698327997154037,
      "grad_norm": 14.75,
      "learning_rate": 3.6418604651162793e-06,
      "loss": 2.7907,
      "step": 1990
    },
    {
      "epoch": 0.17787264318747775,
      "grad_norm": 14.25,
      "learning_rate": 3.502325581395349e-06,
      "loss": 2.6383,
      "step": 2000
    },
    {
      "epoch": 0.17876200640341516,
      "grad_norm": 13.4375,
      "learning_rate": 3.3627906976744185e-06,
      "loss": 2.62,
      "step": 2010
    },
    {
      "epoch": 0.17965136961935255,
      "grad_norm": 16.0,
      "learning_rate": 3.2232558139534883e-06,
      "loss": 2.6536,
      "step": 2020
    },
    {
      "epoch": 0.18054073283528993,
      "grad_norm": 13.3125,
      "learning_rate": 3.083720930232558e-06,
      "loss": 2.6178,
      "step": 2030
    },
    {
      "epoch": 0.1814300960512273,
      "grad_norm": 15.0,
      "learning_rate": 2.944186046511628e-06,
      "loss": 2.7065,
      "step": 2040
    },
    {
      "epoch": 0.1823194592671647,
      "grad_norm": 12.375,
      "learning_rate": 2.8046511627906977e-06,
      "loss": 2.6898,
      "step": 2050
    },
    {
      "epoch": 0.1832088224831021,
      "grad_norm": 14.0625,
      "learning_rate": 2.6651162790697675e-06,
      "loss": 2.5896,
      "step": 2060
    },
    {
      "epoch": 0.1840981856990395,
      "grad_norm": 15.4375,
      "learning_rate": 2.5255813953488374e-06,
      "loss": 2.6833,
      "step": 2070
    },
    {
      "epoch": 0.18498754891497687,
      "grad_norm": 15.5,
      "learning_rate": 2.386046511627907e-06,
      "loss": 2.7408,
      "step": 2080
    },
    {
      "epoch": 0.18587691213091426,
      "grad_norm": 13.375,
      "learning_rate": 2.246511627906977e-06,
      "loss": 2.7091,
      "step": 2090
    },
    {
      "epoch": 0.18676627534685167,
      "grad_norm": 14.1875,
      "learning_rate": 2.1069767441860464e-06,
      "loss": 2.4644,
      "step": 2100
    },
    {
      "epoch": 0.18765563856278905,
      "grad_norm": 14.625,
      "learning_rate": 1.967441860465116e-06,
      "loss": 2.7544,
      "step": 2110
    },
    {
      "epoch": 0.18854500177872643,
      "grad_norm": 14.625,
      "learning_rate": 1.8279069767441862e-06,
      "loss": 2.7613,
      "step": 2120
    },
    {
      "epoch": 0.18943436499466382,
      "grad_norm": 14.0625,
      "learning_rate": 1.6883720930232558e-06,
      "loss": 2.5281,
      "step": 2130
    },
    {
      "epoch": 0.1903237282106012,
      "grad_norm": 14.1875,
      "learning_rate": 1.5488372093023256e-06,
      "loss": 2.56,
      "step": 2140
    },
    {
      "epoch": 0.1912130914265386,
      "grad_norm": 14.0,
      "learning_rate": 1.4093023255813954e-06,
      "loss": 2.6563,
      "step": 2150
    },
    {
      "epoch": 0.192102454642476,
      "grad_norm": 14.0,
      "learning_rate": 1.2697674418604653e-06,
      "loss": 2.6377,
      "step": 2160
    },
    {
      "epoch": 0.19299181785841338,
      "grad_norm": 14.4375,
      "learning_rate": 1.1302325581395349e-06,
      "loss": 2.694,
      "step": 2170
    },
    {
      "epoch": 0.19388118107435076,
      "grad_norm": 14.5,
      "learning_rate": 9.906976744186047e-07,
      "loss": 2.6686,
      "step": 2180
    },
    {
      "epoch": 0.19477054429028814,
      "grad_norm": 14.0625,
      "learning_rate": 8.511627906976745e-07,
      "loss": 2.7232,
      "step": 2190
    },
    {
      "epoch": 0.19565990750622556,
      "grad_norm": 14.5625,
      "learning_rate": 7.116279069767442e-07,
      "loss": 2.7544,
      "step": 2200
    },
    {
      "epoch": 0.19654927072216294,
      "grad_norm": 13.5,
      "learning_rate": 5.72093023255814e-07,
      "loss": 2.6411,
      "step": 2210
    },
    {
      "epoch": 0.19743863393810032,
      "grad_norm": 13.1875,
      "learning_rate": 4.325581395348837e-07,
      "loss": 2.5296,
      "step": 2220
    },
    {
      "epoch": 0.1983279971540377,
      "grad_norm": 15.25,
      "learning_rate": 2.9302325581395347e-07,
      "loss": 2.5208,
      "step": 2230
    },
    {
      "epoch": 0.1992173603699751,
      "grad_norm": 13.125,
      "learning_rate": 1.5348837209302325e-07,
      "loss": 2.6465,
      "step": 2240
    },
    {
      "epoch": 0.2001067235859125,
      "grad_norm": 14.75,
      "learning_rate": 1.3953488372093025e-08,
      "loss": 2.7407,
      "step": 2250
    }
  ],
  "logging_steps": 10,
  "max_steps": 2250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3699129752e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}