{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6351851851851853, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 341.3775576636905, "epoch": 0.006481481481481481, "grad_norm": 0.14961056411266327, "learning_rate": 0.0, "loss": -0.0119, "num_tokens": 3461091.0, "reward": 1.2052154455866133, "reward_std": 0.5575123486064729, "rewards/acc_reward_func": 1.2052154285567147, "step": 1 }, { "clip_ratio": 0.0, "epoch": 0.012962962962962963, "grad_norm": 0.1496139019727707, "learning_rate": 1.2987012987012988e-08, "loss": -0.0119, "step": 2 }, { "clip_ratio": 0.0001939246332247941, "epoch": 0.019444444444444445, "grad_norm": 0.1510600745677948, "learning_rate": 2.5974025974025976e-08, "loss": -0.0118, "step": 3 }, { "clip_ratio": 0.0001826526765528667, "epoch": 0.025925925925925925, "grad_norm": 0.15029065310955048, "learning_rate": 3.8961038961038956e-08, "loss": -0.0118, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 339.6235874720982, "epoch": 0.032407407407407406, "grad_norm": 0.14211858808994293, "learning_rate": 5.194805194805195e-08, "loss": 0.003, "num_tokens": 7190101.0, "reward": 1.2403628343627566, "reward_std": 0.5266946000712258, "rewards/acc_reward_func": 1.2403628116562253, "step": 5 }, { "clip_ratio": 0.00025525176377933737, "epoch": 0.03888888888888889, "grad_norm": 0.1438485085964203, "learning_rate": 6.493506493506492e-08, "loss": 0.0029, "step": 6 }, { "clip_ratio": 0.0002173587291465429, "epoch": 0.04537037037037037, "grad_norm": 0.14308251440525055, "learning_rate": 7.792207792207791e-08, "loss": 0.003, "step": 7 }, { "clip_ratio": 0.00018235097082825157, "epoch": 0.05185185185185185, "grad_norm": 0.14172835648059845, "learning_rate": 9.09090909090909e-08, "loss": 0.0029, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 342.4966067359561, "epoch": 0.058333333333333334, "grad_norm": 0.15887229144573212, "learning_rate": 1.038961038961039e-07, "loss": -0.0015, "num_tokens": 10969747.0, "reward": 1.119047638915834, "reward_std": 0.5877589228607359, "rewards/acc_reward_func": 1.1190476105326699, "step": 9 }, { "clip_ratio": 0.00026269415199446183, "epoch": 0.06481481481481481, "grad_norm": 0.15948110818862915, "learning_rate": 1.1688311688311688e-07, "loss": -0.0015, "step": 10 }, { "clip_ratio": 0.00023042162952368104, "epoch": 0.0712962962962963, "grad_norm": 0.16856980323791504, "learning_rate": 1.2987012987012984e-07, "loss": -0.0015, "step": 11 }, { "clip_ratio": 0.0002131570793045241, "epoch": 0.07777777777777778, "grad_norm": 0.15859223902225494, "learning_rate": 1.4285714285714285e-07, "loss": -0.0015, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 336.6644025530134, "epoch": 0.08425925925925926, "grad_norm": 0.14740879833698273, "learning_rate": 1.5584415584415582e-07, "loss": -0.0089, "num_tokens": 14484005.0, "reward": 1.2426304050854273, "reward_std": 0.5664714901220231, "rewards/acc_reward_func": 1.2426303937321617, "step": 13 }, { "clip_ratio": 0.00024305369533110586, "epoch": 0.09074074074074075, "grad_norm": 0.14839285612106323, "learning_rate": 1.6883116883116883e-07, "loss": -0.0089, "step": 14 }, { "clip_ratio": 0.0002752210618512306, "epoch": 0.09722222222222222, "grad_norm": 0.14957986772060394, "learning_rate": 1.818181818181818e-07, "loss": -0.0089, "step": 15 }, { "clip_ratio": 0.0002536561978991986, "epoch": 0.1037037037037037, "grad_norm": 0.14792965352535248, "learning_rate": 1.948051948051948e-07, "loss": -0.0089, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 342.96145775204616, "epoch": 0.11018518518518519, "grad_norm": 0.1590029001235962, "learning_rate": 2.077922077922078e-07, "loss": -0.0036, "num_tokens": 17968861.0, "reward": 1.1337868769963582, "reward_std": 0.6494243343671163, "rewards/acc_reward_func": 1.1337868372599285, "step": 17 }, { "clip_ratio": 0.00021958356748135493, "epoch": 0.11666666666666667, "grad_norm": 0.1575755625963211, "learning_rate": 2.2077922077922076e-07, "loss": -0.0036, "step": 18 }, { "clip_ratio": 0.00023815594821436598, "epoch": 0.12314814814814815, "grad_norm": 0.15869197249412537, "learning_rate": 2.3376623376623376e-07, "loss": -0.0036, "step": 19 }, { "clip_ratio": 0.00024751310792496604, "epoch": 0.12962962962962962, "grad_norm": 0.16153784096240997, "learning_rate": 2.4675324675324674e-07, "loss": -0.0036, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 348.9512561616443, "epoch": 0.1361111111111111, "grad_norm": 0.14674022793769836, "learning_rate": 2.597402597402597e-07, "loss": 0.0003, "num_tokens": 21588414.0, "reward": 1.1496598862466358, "reward_std": 0.5963577500411442, "rewards/acc_reward_func": 1.1496598692167372, "step": 21 }, { "clip_ratio": 0.00019937331585207998, "epoch": 0.1425925925925926, "grad_norm": 0.14694689214229584, "learning_rate": 2.727272727272727e-07, "loss": 0.0003, "step": 22 }, { "clip_ratio": 0.00015258416533470154, "epoch": 0.14907407407407408, "grad_norm": 0.14661847054958344, "learning_rate": 2.857142857142857e-07, "loss": 0.0003, "step": 23 }, { "clip_ratio": 0.00018231397500910264, "epoch": 0.15555555555555556, "grad_norm": 0.14601057767868042, "learning_rate": 2.987012987012987e-07, "loss": 0.0003, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 342.5408208937872, "epoch": 0.16203703703703703, "grad_norm": 0.13536295294761658, "learning_rate": 3.1168831168831165e-07, "loss": 0.006, "num_tokens": 25015431.0, "reward": 1.2891156673431396, "reward_std": 0.529950432124592, "rewards/acc_reward_func": 1.289115655989874, "step": 25 }, { "clip_ratio": 0.000159220098707703, "epoch": 0.1685185185185185, "grad_norm": 0.1358516961336136, "learning_rate": 3.2467532467532465e-07, "loss": 0.006, "step": 26 }, { "clip_ratio": 0.00018200912347224186, "epoch": 0.175, "grad_norm": 0.13574668765068054, "learning_rate": 3.3766233766233765e-07, "loss": 0.006, "step": 27 }, { "clip_ratio": 0.0002001972615765962, "epoch": 0.1814814814814815, "grad_norm": 0.13535848259925842, "learning_rate": 3.5064935064935066e-07, "loss": 0.006, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 334.4512532552083, "epoch": 0.18796296296296297, "grad_norm": 0.15384796261787415, "learning_rate": 3.636363636363636e-07, "loss": -0.0034, "num_tokens": 28410989.0, "reward": 1.1836734868231273, "reward_std": 0.5527269648654121, "rewards/acc_reward_func": 1.1836734754698617, "step": 29 }, { "clip_ratio": 0.00026356185602101806, "epoch": 0.19444444444444445, "grad_norm": 0.15437930822372437, "learning_rate": 3.766233766233766e-07, "loss": -0.0034, "step": 30 }, { "clip_ratio": 0.0002261245880661244, "epoch": 0.20092592592592592, "grad_norm": 0.15577569603919983, "learning_rate": 3.896103896103896e-07, "loss": -0.0035, "step": 31 }, { "clip_ratio": 0.00024967778119302933, "epoch": 0.2074074074074074, "grad_norm": 0.1522216796875, "learning_rate": 4.025974025974026e-07, "loss": -0.0035, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 348.5476234072731, "epoch": 0.21388888888888888, "grad_norm": 0.14673157036304474, "learning_rate": 4.155844155844156e-07, "loss": -0.0123, "num_tokens": 32101754.0, "reward": 1.2324263226418268, "reward_std": 0.5690165821995053, "rewards/acc_reward_func": 1.232426316965194, "step": 33 }, { "clip_ratio": 0.00017888651875961971, "epoch": 0.22037037037037038, "grad_norm": 0.15019653737545013, "learning_rate": 4.285714285714285e-07, "loss": -0.0123, "step": 34 }, { "clip_ratio": 0.0002070159586894858, "epoch": 0.22685185185185186, "grad_norm": 0.14540189504623413, "learning_rate": 4.415584415584415e-07, "loss": -0.0123, "step": 35 }, { "clip_ratio": 0.00024694520574744923, "epoch": 0.23333333333333334, "grad_norm": 0.14700071513652802, "learning_rate": 4.545454545454545e-07, "loss": -0.0124, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 335.0714358375186, "epoch": 0.23981481481481481, "grad_norm": 0.14185680449008942, "learning_rate": 4.675324675324675e-07, "loss": 0.0017, "num_tokens": 35797715.0, "reward": 1.2755102061090016, "reward_std": 0.5158246074404035, "rewards/acc_reward_func": 1.275510203270685, "step": 37 }, { "clip_ratio": 0.00019795322302906286, "epoch": 0.2462962962962963, "grad_norm": 0.1420549750328064, "learning_rate": 4.805194805194805e-07, "loss": 0.0015, "step": 38 }, { "clip_ratio": 0.00026202407579625114, "epoch": 0.25277777777777777, "grad_norm": 0.1423768699169159, "learning_rate": 4.935064935064935e-07, "loss": 0.0015, "step": 39 }, { "clip_ratio": 0.00026934566440537484, "epoch": 0.25925925925925924, "grad_norm": 0.1396292746067047, "learning_rate": 5.064935064935064e-07, "loss": 0.0014, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 348.11111740838913, "epoch": 0.2657407407407407, "grad_norm": 0.14567208290100098, "learning_rate": 5.194805194805194e-07, "loss": 0.0041, "num_tokens": 39264277.0, "reward": 1.2585034256889707, "reward_std": 0.5927184422810873, "rewards/acc_reward_func": 1.2585034001441229, "step": 41 }, { "clip_ratio": 0.00025571950287225524, "epoch": 0.2722222222222222, "grad_norm": 0.14597439765930176, "learning_rate": 5.324675324675324e-07, "loss": 0.0041, "step": 42 }, { "clip_ratio": 0.00022081260283898918, "epoch": 0.27870370370370373, "grad_norm": 0.16748785972595215, "learning_rate": 5.454545454545454e-07, "loss": 0.004, "step": 43 }, { "clip_ratio": 0.00034207346158966957, "epoch": 0.2851851851851852, "grad_norm": 0.14416222274303436, "learning_rate": 5.584415584415584e-07, "loss": 0.0039, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 342.09977649507067, "epoch": 0.2916666666666667, "grad_norm": 0.1368248164653778, "learning_rate": 5.714285714285714e-07, "loss": 0.0052, "num_tokens": 42894683.0, "reward": 1.3628118208476476, "reward_std": 0.5181492559966587, "rewards/acc_reward_func": 1.3628117924644834, "step": 45 }, { "clip_ratio": 0.00015652789096791474, "epoch": 0.29814814814814816, "grad_norm": 0.13666287064552307, "learning_rate": 5.844155844155844e-07, "loss": 0.0051, "step": 46 }, { "clip_ratio": 0.0002253453385492321, "epoch": 0.30462962962962964, "grad_norm": 0.1370047777891159, "learning_rate": 5.974025974025974e-07, "loss": 0.0051, "step": 47 }, { "clip_ratio": 0.00027938479650488476, "epoch": 0.3111111111111111, "grad_norm": 0.13495419919490814, "learning_rate": 6.103896103896103e-07, "loss": 0.0049, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 351.9739292689732, "epoch": 0.3175925925925926, "grad_norm": 0.12604905664920807, "learning_rate": 6.233766233766233e-07, "loss": -0.0111, "num_tokens": 46253250.0, "reward": 1.3662131769316537, "reward_std": 0.4832150155589694, "rewards/acc_reward_func": 1.3662131428718567, "step": 49 }, { "clip_ratio": 0.0001608005228003354, "epoch": 0.32407407407407407, "grad_norm": 0.12657120823860168, "learning_rate": 6.363636363636363e-07, "loss": -0.0112, "step": 50 }, { "clip_ratio": 0.00018877648212115413, "epoch": 0.33055555555555555, "grad_norm": 0.12609201669692993, "learning_rate": 6.493506493506493e-07, "loss": -0.0112, "step": 51 }, { "clip_ratio": 0.00020845069507014982, "epoch": 0.337037037037037, "grad_norm": 0.12553073465824127, "learning_rate": 6.623376623376623e-07, "loss": -0.0114, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 346.6439964657738, "epoch": 0.3435185185185185, "grad_norm": 0.1337604820728302, "learning_rate": 6.753246753246753e-07, "loss": 0.0089, "num_tokens": 50081086.0, "reward": 1.2913832437424433, "reward_std": 0.5028326373015132, "rewards/acc_reward_func": 1.2913832181975955, "step": 53 }, { "clip_ratio": 0.00016935093160663244, "epoch": 0.35, "grad_norm": 0.1348680555820465, "learning_rate": 6.883116883116883e-07, "loss": 0.0088, "step": 54 }, { "clip_ratio": 0.00019904559875223122, "epoch": 0.35648148148148145, "grad_norm": 0.13407811522483826, "learning_rate": 7.012987012987013e-07, "loss": 0.0087, "step": 55 }, { "clip_ratio": 0.0002497758089573056, "epoch": 0.362962962962963, "grad_norm": 0.13265763223171234, "learning_rate": 7.142857142857143e-07, "loss": 0.0085, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 349.2063555036272, "epoch": 0.36944444444444446, "grad_norm": 0.12446761876344681, "learning_rate": 7.272727272727272e-07, "loss": 0.0028, "num_tokens": 53519154.0, "reward": 1.2641723610105968, "reward_std": 0.4625132538023449, "rewards/acc_reward_func": 1.2641723383040655, "step": 57 }, { "clip_ratio": 0.00018638262017269707, "epoch": 0.37592592592592594, "grad_norm": 0.12733881175518036, "learning_rate": 7.402597402597402e-07, "loss": 0.0028, "step": 58 }, { "clip_ratio": 0.0002025287897032242, "epoch": 0.3824074074074074, "grad_norm": 0.12367873638868332, "learning_rate": 7.532467532467532e-07, "loss": 0.0027, "step": 59 }, { "clip_ratio": 0.000297035732641927, "epoch": 0.3888888888888889, "grad_norm": 0.12311229109764099, "learning_rate": 7.662337662337662e-07, "loss": 0.0025, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 345.8061290922619, "epoch": 0.39537037037037037, "grad_norm": 0.13111141324043274, "learning_rate": 7.792207792207792e-07, "loss": 0.0106, "num_tokens": 57414909.0, "reward": 1.2641723638489133, "reward_std": 0.4437567651981399, "rewards/acc_reward_func": 1.2641723383040655, "step": 61 }, { "clip_ratio": 0.00016025772319629877, "epoch": 0.40185185185185185, "grad_norm": 0.12952443957328796, "learning_rate": 7.922077922077922e-07, "loss": 0.0104, "step": 62 }, { "clip_ratio": 0.00023572324270000017, "epoch": 0.4083333333333333, "grad_norm": 0.1300145536661148, "learning_rate": 8.051948051948052e-07, "loss": 0.0103, "step": 63 }, { "clip_ratio": 0.00037697855891781815, "epoch": 0.4148148148148148, "grad_norm": 0.12851402163505554, "learning_rate": 8.181818181818182e-07, "loss": 0.0101, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 337.216561453683, "epoch": 0.4212962962962963, "grad_norm": 0.1267780065536499, "learning_rate": 8.311688311688312e-07, "loss": -0.0102, "num_tokens": 60989390.0, "reward": 1.3015873318626767, "reward_std": 0.41901361587501706, "rewards/acc_reward_func": 1.3015873034795125, "step": 65 }, { "clip_ratio": 0.00017322945980898416, "epoch": 0.42777777777777776, "grad_norm": 0.12472664564847946, "learning_rate": 8.44155844155844e-07, "loss": -0.0103, "step": 66 }, { "clip_ratio": 0.000287876123149458, "epoch": 0.43425925925925923, "grad_norm": 0.15554016828536987, "learning_rate": 8.57142857142857e-07, "loss": -0.0105, "step": 67 }, { "clip_ratio": 0.0003167382113003571, "epoch": 0.44074074074074077, "grad_norm": 0.12460165470838547, "learning_rate": 8.7012987012987e-07, "loss": -0.0107, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 324.6723414829799, "epoch": 0.44722222222222224, "grad_norm": 0.12794345617294312, "learning_rate": 8.83116883116883e-07, "loss": -0.0049, "num_tokens": 64756585.0, "reward": 1.3684807561692738, "reward_std": 0.39905343807878946, "rewards/acc_reward_func": 1.3684807050795782, "step": 69 }, { "clip_ratio": 0.00012579495558470843, "epoch": 0.4537037037037037, "grad_norm": 0.12346912920475006, "learning_rate": 8.96103896103896e-07, "loss": -0.005, "step": 70 }, { "clip_ratio": 0.0001513983377316479, "epoch": 0.4601851851851852, "grad_norm": 0.1247495487332344, "learning_rate": 9.09090909090909e-07, "loss": -0.0052, "step": 71 }, { "clip_ratio": 0.0002705767042893318, "epoch": 0.4666666666666667, "grad_norm": 0.1223696619272232, "learning_rate": 9.22077922077922e-07, "loss": -0.0055, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 336.5226818266369, "epoch": 0.47314814814814815, "grad_norm": 0.12345382571220398, "learning_rate": 9.35064935064935e-07, "loss": -0.0019, "num_tokens": 68501142.0, "reward": 1.3707483268919445, "reward_std": 0.4370829054997081, "rewards/acc_reward_func": 1.3707483155386788, "step": 73 }, { "clip_ratio": 0.00018952846065596013, "epoch": 0.47962962962962963, "grad_norm": 0.12303110212087631, "learning_rate": 9.480519480519479e-07, "loss": -0.002, "step": 74 }, { "clip_ratio": 0.0002436372330218243, "epoch": 0.4861111111111111, "grad_norm": 0.12270648032426834, "learning_rate": 9.61038961038961e-07, "loss": -0.0023, "step": 75 }, { "clip_ratio": 0.0003469188186933198, "epoch": 0.4925925925925926, "grad_norm": 0.12180577218532562, "learning_rate": 9.74025974025974e-07, "loss": -0.0026, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 349.16893804640995, "epoch": 0.49907407407407406, "grad_norm": 0.11210431158542633, "learning_rate": 9.87012987012987e-07, "loss": 0.0069, "num_tokens": 72117875.0, "reward": 1.3673469736462547, "reward_std": 0.36692249313706443, "rewards/acc_reward_func": 1.3673469566163563, "step": 77 }, { "clip_ratio": 0.00018126107360807336, "epoch": 0.5055555555555555, "grad_norm": 0.11090458184480667, "learning_rate": 1e-06, "loss": 0.0068, "step": 78 }, { "clip_ratio": 0.0002199545553940836, "epoch": 0.5120370370370371, "grad_norm": 0.10997123271226883, "learning_rate": 1e-06, "loss": 0.0066, "step": 79 }, { "clip_ratio": 0.00034607137768900777, "epoch": 0.5185185185185185, "grad_norm": 0.1732209026813507, "learning_rate": 1e-06, "loss": 0.0063, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 347.36735171363466, "epoch": 0.525, "grad_norm": 0.10826165974140167, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 75894389.0, "reward": 1.3752834740139188, "reward_std": 0.33868200793152764, "rewards/acc_reward_func": 1.3752834569840204, "step": 81 }, { "clip_ratio": 0.00017236430254082995, "epoch": 0.5314814814814814, "grad_norm": 0.1089528352022171, "learning_rate": 1e-06, "loss": -0.0009, "step": 82 }, { "clip_ratio": 0.0002018414240973514, "epoch": 0.537962962962963, "grad_norm": 0.10826719552278519, "learning_rate": 1e-06, "loss": -0.001, "step": 83 }, { "clip_ratio": 0.0002509245447678647, "epoch": 0.5444444444444444, "grad_norm": 0.1076684221625328, "learning_rate": 1e-06, "loss": -0.0013, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 340.232430594308, "epoch": 0.5509259259259259, "grad_norm": 0.10895638167858124, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 79825968.0, "reward": 1.447845839318775, "reward_std": 0.2928337816681181, "rewards/acc_reward_func": 1.447845805258978, "step": 85 }, { "clip_ratio": 0.00012420982654605593, "epoch": 0.5574074074074075, "grad_norm": 0.10320693999528885, "learning_rate": 1e-06, "loss": 0.0038, "step": 86 }, { "clip_ratio": 0.00014987519015059142, "epoch": 0.5638888888888889, "grad_norm": 0.10314636677503586, "learning_rate": 1e-06, "loss": 0.0037, "step": 87 }, { "clip_ratio": 0.00020615724549445855, "epoch": 0.5703703703703704, "grad_norm": 0.1004006415605545, "learning_rate": 1e-06, "loss": 0.0034, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 339.4410487583705, "epoch": 0.5768518518518518, "grad_norm": 0.10505778342485428, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 83484809.0, "reward": 1.418367367415201, "reward_std": 0.33532609577689854, "rewards/acc_reward_func": 1.4183673447086698, "step": 89 }, { "clip_ratio": 0.00018661540144378143, "epoch": 0.5833333333333334, "grad_norm": 0.10543543845415115, "learning_rate": 1e-06, "loss": 0.0055, "step": 90 }, { "clip_ratio": 0.0002658298014596637, "epoch": 0.5898148148148148, "grad_norm": 0.10678199678659439, "learning_rate": 1e-06, "loss": 0.0053, "step": 91 }, { "clip_ratio": 0.00048629668336139904, "epoch": 0.5962962962962963, "grad_norm": 0.10415786504745483, "learning_rate": 1e-06, "loss": 0.005, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 335.6065862746466, "epoch": 0.6027777777777777, "grad_norm": 0.12411384284496307, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 87190872.0, "reward": 1.4285714569545926, "reward_std": 0.3354296861659913, "rewards/acc_reward_func": 1.4285714342480613, "step": 93 }, { "clip_ratio": 0.0001295947523820879, "epoch": 0.6092592592592593, "grad_norm": 0.11555258184671402, "learning_rate": 1e-06, "loss": -0.0092, "step": 94 }, { "clip_ratio": 0.00023690979426083643, "epoch": 0.6157407407407407, "grad_norm": 0.11537593603134155, "learning_rate": 1e-06, "loss": -0.0094, "step": 95 }, { "clip_ratio": 0.0004145491839153692, "epoch": 0.6222222222222222, "grad_norm": 0.1124836802482605, "learning_rate": 1e-06, "loss": -0.0097, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 336.0283493768601, "epoch": 0.6287037037037037, "grad_norm": 0.1106274202466011, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 90867337.0, "reward": 1.3321995706785292, "reward_std": 0.3691760081620443, "rewards/acc_reward_func": 1.332199547971998, "step": 97 }, { "clip_ratio": 0.00021598117850122174, "epoch": 0.6351851851851852, "grad_norm": 0.11104429513216019, "learning_rate": 1e-06, "loss": 0.0017, "step": 98 }, { "clip_ratio": 0.00030358976974163116, "epoch": 0.6416666666666667, "grad_norm": 0.11258433759212494, "learning_rate": 1e-06, "loss": 0.0014, "step": 99 }, { "clip_ratio": 0.000491898344729894, "epoch": 0.6481481481481481, "grad_norm": 0.10996989905834198, "learning_rate": 1e-06, "loss": 0.001, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 353.0181477864583, "epoch": 0.6546296296296297, "grad_norm": 0.11205079406499863, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 94459043.0, "reward": 1.3310657909938268, "reward_std": 0.36978748050473986, "rewards/acc_reward_func": 1.3310657569340296, "step": 101 }, { "clip_ratio": 0.00014175956433367295, "epoch": 0.6611111111111111, "grad_norm": 0.11174244433641434, "learning_rate": 1e-06, "loss": 0.003, "step": 102 }, { "clip_ratio": 0.00017096654950624464, "epoch": 0.6675925925925926, "grad_norm": 0.1625394970178604, "learning_rate": 1e-06, "loss": 0.0027, "step": 103 }, { "clip_ratio": 0.00030162992853937406, "epoch": 0.674074074074074, "grad_norm": 0.10828293114900589, "learning_rate": 1e-06, "loss": 0.0023, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 356.01587640671505, "epoch": 0.6805555555555556, "grad_norm": 0.10728833824396133, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 98074129.0, "reward": 1.3945578563781011, "reward_std": 0.3750602141732261, "rewards/acc_reward_func": 1.3945578223183042, "step": 105 }, { "clip_ratio": 0.00015117749440140047, "epoch": 0.687037037037037, "grad_norm": 0.10758285224437714, "learning_rate": 1e-06, "loss": -0.0031, "step": 106 }, { "clip_ratio": 0.00019362062206103778, "epoch": 0.6935185185185185, "grad_norm": 0.10650208592414856, "learning_rate": 1e-06, "loss": -0.0034, "step": 107 }, { "clip_ratio": 0.0004002706260680931, "epoch": 0.7, "grad_norm": 0.10655343532562256, "learning_rate": 1e-06, "loss": -0.0038, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 357.0793747674851, "epoch": 0.7064814814814815, "grad_norm": 0.11842089891433716, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 101921189.0, "reward": 1.419501157034011, "reward_std": 0.3633141240903309, "rewards/acc_reward_func": 1.419501128650847, "step": 109 }, { "clip_ratio": 0.00015762935955925577, "epoch": 0.7129629629629629, "grad_norm": 0.11337336152791977, "learning_rate": 1e-06, "loss": 0.0074, "step": 110 }, { "clip_ratio": 0.0002720732234246541, "epoch": 0.7194444444444444, "grad_norm": 0.11286605894565582, "learning_rate": 1e-06, "loss": 0.0071, "step": 111 }, { "clip_ratio": 0.0004098817347160851, "epoch": 0.725925925925926, "grad_norm": 0.14437150955200195, "learning_rate": 1e-06, "loss": 0.0067, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 382.4637262253534, "epoch": 0.7324074074074074, "grad_norm": 0.1585981696844101, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 105554040.0, "reward": 1.4580499132474263, "reward_std": 0.31650315987921895, "rewards/acc_reward_func": 1.458049896217528, "step": 113 }, { "clip_ratio": 0.0001126181769428686, "epoch": 0.7388888888888889, "grad_norm": 0.09816838800907135, "learning_rate": 1e-06, "loss": 0.0062, "step": 114 }, { "clip_ratio": 0.00020272329320072285, "epoch": 0.7453703703703703, "grad_norm": 0.09763254970312119, "learning_rate": 1e-06, "loss": 0.006, "step": 115 }, { "clip_ratio": 0.00032899622670984607, "epoch": 0.7518518518518519, "grad_norm": 0.0961078405380249, "learning_rate": 1e-06, "loss": 0.0057, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 380.40476771763394, "epoch": 0.7583333333333333, "grad_norm": 0.11743240058422089, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 109165971.0, "reward": 1.434240386599586, "reward_std": 0.33151187073616756, "rewards/acc_reward_func": 1.4342403638930548, "step": 117 }, { "clip_ratio": 0.00016299381570119987, "epoch": 0.7648148148148148, "grad_norm": 0.11144307255744934, "learning_rate": 1e-06, "loss": 0.0081, "step": 118 }, { "clip_ratio": 0.00021788747177370603, "epoch": 0.7712962962962963, "grad_norm": 0.11082971841096878, "learning_rate": 1e-06, "loss": 0.0078, "step": 119 }, { "clip_ratio": 0.00036926281176246923, "epoch": 0.7777777777777778, "grad_norm": 0.10246479511260986, "learning_rate": 1e-06, "loss": 0.0074, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 365.10658046177457, "epoch": 0.7842592592592592, "grad_norm": 0.09951800853013992, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 112830427.0, "reward": 1.5158730347951253, "reward_std": 0.3210904362301032, "rewards/acc_reward_func": 1.5158730291184925, "step": 121 }, { "clip_ratio": 0.0001313327319665058, "epoch": 0.7907407407407407, "grad_norm": 0.102614626288414, "learning_rate": 1e-06, "loss": 0.0006, "step": 122 }, { "clip_ratio": 0.00019906960508143086, "epoch": 0.7972222222222223, "grad_norm": 0.11046651005744934, "learning_rate": 1e-06, "loss": 0.0003, "step": 123 }, { "clip_ratio": 0.0003351353970217696, "epoch": 0.8037037037037037, "grad_norm": 0.1014101579785347, "learning_rate": 1e-06, "loss": -0.0001, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 364.8957010904948, "epoch": 0.8101851851851852, "grad_norm": 0.10484964400529861, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 116491721.0, "reward": 1.4365079601605732, "reward_std": 0.3235861455045995, "rewards/acc_reward_func": 1.4365079317774092, "step": 125 }, { "clip_ratio": 0.00018362473704092692, "epoch": 0.8166666666666667, "grad_norm": 0.10315818339586258, "learning_rate": 1e-06, "loss": 0.0082, "step": 126 }, { "clip_ratio": 0.0003200927950231181, "epoch": 0.8231481481481482, "grad_norm": 0.09985575079917908, "learning_rate": 1e-06, "loss": 0.0079, "step": 127 }, { "clip_ratio": 0.0006616064056288451, "epoch": 0.8296296296296296, "grad_norm": 0.09683224558830261, "learning_rate": 1e-06, "loss": 0.0075, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 353.2698451450893, "epoch": 0.8361111111111111, "grad_norm": 0.11193078756332397, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 119939031.0, "reward": 1.4308390333538963, "reward_std": 0.3666802270426637, "rewards/acc_reward_func": 1.4308389992940993, "step": 129 }, { "clip_ratio": 0.0002013682514232295, "epoch": 0.8425925925925926, "grad_norm": 0.11090143024921417, "learning_rate": 1e-06, "loss": 0.005, "step": 130 }, { "clip_ratio": 0.00041231308672909757, "epoch": 0.8490740740740741, "grad_norm": 0.10781311243772507, "learning_rate": 1e-06, "loss": 0.0047, "step": 131 }, { "clip_ratio": 0.0008236173737151105, "epoch": 0.8555555555555555, "grad_norm": 0.11081597954034805, "learning_rate": 1e-06, "loss": 0.0043, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 359.4410487583705, "epoch": 0.862037037037037, "grad_norm": 0.09571786969900131, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 123640940.0, "reward": 1.4591836957704454, "reward_std": 0.3062526817832674, "rewards/acc_reward_func": 1.4591836702255976, "step": 133 }, { "clip_ratio": 0.00016750225785515447, "epoch": 0.8685185185185185, "grad_norm": 0.09535212069749832, "learning_rate": 1e-06, "loss": -0.004, "step": 134 }, { "clip_ratio": 0.0002182886072703349, "epoch": 0.875, "grad_norm": 0.09480314701795578, "learning_rate": 1e-06, "loss": -0.0043, "step": 135 }, { "clip_ratio": 0.0003651986255482327, "epoch": 0.8814814814814815, "grad_norm": 0.09624945372343063, "learning_rate": 1e-06, "loss": -0.0047, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 367.3832266671317, "epoch": 0.887962962962963, "grad_norm": 0.10961057245731354, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 127169248.0, "reward": 1.3900227064178103, "reward_std": 0.38218230328389574, "rewards/acc_reward_func": 1.3900226638430642, "step": 137 }, { "clip_ratio": 0.00014266002645377913, "epoch": 0.8944444444444445, "grad_norm": 0.10866183787584305, "learning_rate": 1e-06, "loss": 0.0107, "step": 138 }, { "clip_ratio": 0.0002758925520105376, "epoch": 0.9009259259259259, "grad_norm": 0.10819561779499054, "learning_rate": 1e-06, "loss": 0.0104, "step": 139 }, { "clip_ratio": 0.0005836994775260488, "epoch": 0.9074074074074074, "grad_norm": 0.10537825524806976, "learning_rate": 1e-06, "loss": 0.0099, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 370.5873078845796, "epoch": 0.9138888888888889, "grad_norm": 0.10795921087265015, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 130807932.0, "reward": 1.394557862054734, "reward_std": 0.36527003596226376, "rewards/acc_reward_func": 1.3945578393482028, "step": 141 }, { "clip_ratio": 0.00017469531060972562, "epoch": 0.9203703703703704, "grad_norm": 0.10809798538684845, "learning_rate": 1e-06, "loss": 0.0039, "step": 142 }, { "clip_ratio": 0.0004895108766158089, "epoch": 0.9268518518518518, "grad_norm": 0.1056734025478363, "learning_rate": 1e-06, "loss": 0.0036, "step": 143 }, { "clip_ratio": 0.0009459215119325867, "epoch": 0.9333333333333333, "grad_norm": 0.10791812837123871, "learning_rate": 1e-06, "loss": 0.0031, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 373.20408848353793, "epoch": 0.9398148148148148, "grad_norm": 0.09965453296899796, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 134599680.0, "reward": 1.4036281591369992, "reward_std": 0.30771812565979506, "rewards/acc_reward_func": 1.4036281250772022, "step": 145 }, { "clip_ratio": 0.00018105733518799147, "epoch": 0.9462962962962963, "grad_norm": 0.09854891151189804, "learning_rate": 1e-06, "loss": 0.0121, "step": 146 }, { "clip_ratio": 0.0004012293509385061, "epoch": 0.9527777777777777, "grad_norm": 0.09946269541978836, "learning_rate": 1e-06, "loss": 0.0119, "step": 147 }, { "clip_ratio": 0.0005813377453402305, "epoch": 0.9592592592592593, "grad_norm": 0.10020922869443893, "learning_rate": 1e-06, "loss": 0.0115, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 366.29479253859745, "epoch": 0.9657407407407408, "grad_norm": 0.10600566118955612, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 138533624.0, "reward": 1.4217687391099476, "reward_std": 0.31214298538508867, "rewards/acc_reward_func": 1.4217687050501506, "step": 149 }, { "clip_ratio": 0.0001907412124204538, "epoch": 0.9722222222222222, "grad_norm": 0.10447903722524643, "learning_rate": 1e-06, "loss": 0.0062, "step": 150 }, { "clip_ratio": 0.0003977599562425721, "epoch": 0.9787037037037037, "grad_norm": 0.09847753494977951, "learning_rate": 1e-06, "loss": 0.006, "step": 151 }, { "clip_ratio": 0.000723014666229054, "epoch": 0.9851851851851852, "grad_norm": 0.09649986028671265, "learning_rate": 1e-06, "loss": 0.0055, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 368.8061290922619, "epoch": 1.0064814814814815, "grad_norm": 0.1145603284239769, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 142234001.0, "reward": 1.4308390333538963, "reward_std": 0.32152382461797624, "rewards/acc_reward_func": 1.4308390049707322, "step": 153 }, { "clip_ratio": 0.00018503319093031765, "epoch": 1.012962962962963, "grad_norm": 0.10569456219673157, "learning_rate": 1e-06, "loss": -0.004, "step": 154 }, { "clip_ratio": 0.00037826129084264507, "epoch": 1.0194444444444444, "grad_norm": 0.10441906005144119, "learning_rate": 1e-06, "loss": -0.0044, "step": 155 }, { "clip_ratio": 0.0005787124890568, "epoch": 1.025925925925926, "grad_norm": 0.10377710312604904, "learning_rate": 1e-06, "loss": -0.0049, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 358.7063555036272, "epoch": 1.0324074074074074, "grad_norm": 0.10566150397062302, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 145994812.0, "reward": 1.4965986637842088, "reward_std": 0.30973265550675844, "rewards/acc_reward_func": 1.4965986410776775, "step": 157 }, { "clip_ratio": 0.0001249600378893471, "epoch": 1.038888888888889, "grad_norm": 0.10384287685155869, "learning_rate": 1e-06, "loss": 0.0103, "step": 158 }, { "clip_ratio": 0.0002204828872761157, "epoch": 1.0453703703703703, "grad_norm": 0.10222744941711426, "learning_rate": 1e-06, "loss": 0.01, "step": 159 }, { "clip_ratio": 0.0004675700528466786, "epoch": 1.0518518518518518, "grad_norm": 0.10059863328933716, "learning_rate": 1e-06, "loss": 0.0095, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 352.2256338936942, "epoch": 1.0583333333333333, "grad_norm": 0.11056972295045853, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 149621705.0, "reward": 1.5204081819170998, "reward_std": 0.32912652584768476, "rewards/acc_reward_func": 1.5204081648872012, "step": 161 }, { "clip_ratio": 0.00016151682436992858, "epoch": 1.0648148148148149, "grad_norm": 0.11060597002506256, "learning_rate": 1e-06, "loss": 0.0055, "step": 162 }, { "clip_ratio": 0.0004163828950064878, "epoch": 1.0712962962962962, "grad_norm": 0.10768898576498032, "learning_rate": 1e-06, "loss": 0.005, "step": 163 }, { "clip_ratio": 0.000778782024488984, "epoch": 1.0777777777777777, "grad_norm": 0.10523418337106705, "learning_rate": 1e-06, "loss": 0.0045, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 339.5136093866257, "epoch": 1.0842592592592593, "grad_norm": 0.1091051772236824, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 153198374.0, "reward": 1.4353741719609214, "reward_std": 0.27784250605674016, "rewards/acc_reward_func": 1.435374140739441, "step": 165 }, { "clip_ratio": 0.00019789206375467723, "epoch": 1.0907407407407408, "grad_norm": 0.11133051663637161, "learning_rate": 1e-06, "loss": 0.0039, "step": 166 }, { "clip_ratio": 0.0003219507693540349, "epoch": 1.0972222222222223, "grad_norm": 0.1063733696937561, "learning_rate": 1e-06, "loss": 0.0035, "step": 167 }, { "clip_ratio": 0.0007354825044915612, "epoch": 1.1037037037037036, "grad_norm": 0.10145536810159683, "learning_rate": 1e-06, "loss": 0.0029, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 337.5045413062686, "epoch": 1.1101851851851852, "grad_norm": 0.0959630087018013, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 156847449.0, "reward": 1.4739229253360204, "reward_std": 0.2454074023380166, "rewards/acc_reward_func": 1.473922916821071, "step": 169 }, { "clip_ratio": 0.00013901671592888998, "epoch": 1.1166666666666667, "grad_norm": 0.09553571790456772, "learning_rate": 1e-06, "loss": 0.0018, "step": 170 }, { "clip_ratio": 0.0002768557380823906, "epoch": 1.1231481481481482, "grad_norm": 0.09550315886735916, "learning_rate": 1e-06, "loss": 0.0015, "step": 171 }, { "clip_ratio": 0.0005109132904354261, "epoch": 1.1296296296296295, "grad_norm": 0.09238159656524658, "learning_rate": 1e-06, "loss": 0.0011, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 329.7698480515253, "epoch": 1.136111111111111, "grad_norm": 0.1056121438741684, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 160395520.0, "reward": 1.4829932167416526, "reward_std": 0.27240987175277304, "rewards/acc_reward_func": 1.4829931997117543, "step": 173 }, { "clip_ratio": 0.00018629198200956342, "epoch": 1.1425925925925926, "grad_norm": 0.10767155885696411, "learning_rate": 1e-06, "loss": 0.0035, "step": 174 }, { "clip_ratio": 0.00030426528593081805, "epoch": 1.1490740740740741, "grad_norm": 0.10210078209638596, "learning_rate": 1e-06, "loss": 0.0031, "step": 175 }, { "clip_ratio": 0.0007162606953421519, "epoch": 1.1555555555555554, "grad_norm": 0.0999271348118782, "learning_rate": 1e-06, "loss": 0.0026, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 323.3401416596912, "epoch": 1.162037037037037, "grad_norm": 0.10751984268426895, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 164255542.0, "reward": 1.5430839345568703, "reward_std": 0.2709279392092001, "rewards/acc_reward_func": 1.5430838948204404, "step": 177 }, { "clip_ratio": 0.00019573130737712962, "epoch": 1.1685185185185185, "grad_norm": 0.11798489093780518, "learning_rate": 1e-06, "loss": -0.0023, "step": 178 }, { "clip_ratio": 0.00034863099045608016, "epoch": 1.175, "grad_norm": 0.10651062428951263, "learning_rate": 1e-06, "loss": -0.0027, "step": 179 }, { "clip_ratio": 0.0006314800730684684, "epoch": 1.1814814814814816, "grad_norm": 0.10761118680238724, "learning_rate": 1e-06, "loss": -0.0032, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 335.4399167015439, "epoch": 1.1879629629629629, "grad_norm": 0.10813816636800766, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 167941502.0, "reward": 1.416099800950005, "reward_std": 0.23997027143126443, "rewards/acc_reward_func": 1.4160997640518915, "step": 181 }, { "clip_ratio": 9.955801527082388e-05, "epoch": 1.1944444444444444, "grad_norm": 0.10921348631381989, "learning_rate": 1e-06, "loss": 0.0081, "step": 182 }, { "clip_ratio": 0.00018630296004370654, "epoch": 1.200925925925926, "grad_norm": 0.10632283985614777, "learning_rate": 1e-06, "loss": 0.0077, "step": 183 }, { "clip_ratio": 0.00047109573885488015, "epoch": 1.2074074074074075, "grad_norm": 0.1052081361413002, "learning_rate": 1e-06, "loss": 0.0071, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 354.4988723028274, "epoch": 1.2138888888888888, "grad_norm": 0.09669824689626694, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 171898126.0, "reward": 1.5374149935586112, "reward_std": 0.22926786666115126, "rewards/acc_reward_func": 1.5374149680137634, "step": 185 }, { "clip_ratio": 0.0001581089007751351, "epoch": 1.2203703703703703, "grad_norm": 0.09097945690155029, "learning_rate": 1e-06, "loss": 0.0029, "step": 186 }, { "clip_ratio": 0.0001345763438231578, "epoch": 1.2268518518518519, "grad_norm": 0.09122958034276962, "learning_rate": 1e-06, "loss": 0.0026, "step": 187 }, { "clip_ratio": 0.00023815015707181634, "epoch": 1.2333333333333334, "grad_norm": 0.08839327096939087, "learning_rate": 1e-06, "loss": 0.0021, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 350.2641790480841, "epoch": 1.2398148148148147, "grad_norm": 0.12880218029022217, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 175634727.0, "reward": 1.486394587017241, "reward_std": 0.3775232934526035, "rewards/acc_reward_func": 1.4863945699873424, "step": 189 }, { "clip_ratio": 0.00019796302409044335, "epoch": 1.2462962962962962, "grad_norm": 0.12655557692050934, "learning_rate": 1e-06, "loss": 0.0101, "step": 190 }, { "clip_ratio": 0.0004977159599851196, "epoch": 1.2527777777777778, "grad_norm": 0.12340408563613892, "learning_rate": 1e-06, "loss": 0.0096, "step": 191 }, { "clip_ratio": 0.0007802596893660459, "epoch": 1.2592592592592593, "grad_norm": 0.12486658990383148, "learning_rate": 1e-06, "loss": 0.0088, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 357.3310706728981, "epoch": 1.2657407407407408, "grad_norm": 0.12545065581798553, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 179429101.0, "reward": 1.5918367760522025, "reward_std": 0.24694282561540604, "rewards/acc_reward_func": 1.591836724962507, "step": 193 }, { "clip_ratio": 0.00014233627700291219, "epoch": 1.2722222222222221, "grad_norm": 0.09697285294532776, "learning_rate": 1e-06, "loss": 0.0014, "step": 194 }, { "clip_ratio": 0.00021537907499199113, "epoch": 1.2787037037037037, "grad_norm": 0.09571573138237, "learning_rate": 1e-06, "loss": 0.001, "step": 195 }, { "clip_ratio": 0.0003717566467544419, "epoch": 1.2851851851851852, "grad_norm": 0.09427618980407715, "learning_rate": 1e-06, "loss": 0.0006, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 362.25964210146947, "epoch": 1.2916666666666667, "grad_norm": 0.10039424896240234, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 183207074.0, "reward": 1.4138322273890178, "reward_std": 0.2701569392922379, "rewards/acc_reward_func": 1.4138321990058536, "step": 197 }, { "clip_ratio": 0.00013769451262695448, "epoch": 1.2981481481481483, "grad_norm": 0.10113991051912308, "learning_rate": 1e-06, "loss": 0.0055, "step": 198 }, { "clip_ratio": 0.00026389724596464516, "epoch": 1.3046296296296296, "grad_norm": 0.09699271619319916, "learning_rate": 1e-06, "loss": 0.0052, "step": 199 }, { "clip_ratio": 0.00047017908599671153, "epoch": 1.3111111111111111, "grad_norm": 0.09559627622365952, "learning_rate": 1e-06, "loss": 0.0047, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 338.6417265392485, "epoch": 1.3175925925925926, "grad_norm": 0.12740236520767212, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 186949450.0, "reward": 1.5963719118209112, "reward_std": 0.287948662681239, "rewards/acc_reward_func": 1.5963718777611142, "step": 201 }, { "clip_ratio": 0.00018609766872638525, "epoch": 1.324074074074074, "grad_norm": 0.10720109194517136, "learning_rate": 1e-06, "loss": -0.001, "step": 202 }, { "clip_ratio": 0.00027242925110234256, "epoch": 1.3305555555555555, "grad_norm": 0.10511540621519089, "learning_rate": 1e-06, "loss": -0.0014, "step": 203 }, { "clip_ratio": 0.00042802050129033713, "epoch": 1.337037037037037, "grad_norm": 0.10465174168348312, "learning_rate": 1e-06, "loss": -0.002, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 328.1644054594494, "epoch": 1.3435185185185186, "grad_norm": 0.09750308096408844, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 190654781.0, "reward": 1.5521542231241863, "reward_std": 0.2286651549594743, "rewards/acc_reward_func": 1.552154194741022, "step": 205 }, { "clip_ratio": 0.00012115049078905334, "epoch": 1.35, "grad_norm": 0.09753656387329102, "learning_rate": 1e-06, "loss": 0.0027, "step": 206 }, { "clip_ratio": 0.00019190308757104156, "epoch": 1.3564814814814814, "grad_norm": 0.09476270526647568, "learning_rate": 1e-06, "loss": 0.0024, "step": 207 }, { "clip_ratio": 0.00031734065331485387, "epoch": 1.362962962962963, "grad_norm": 0.09239774942398071, "learning_rate": 1e-06, "loss": 0.0019, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 336.24830554780505, "epoch": 1.3694444444444445, "grad_norm": 0.08607508987188339, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 194142806.0, "reward": 1.7006802956263225, "reward_std": 0.19712306239775249, "rewards/acc_reward_func": 1.700680278596424, "step": 209 }, { "clip_ratio": 0.00010146752867426368, "epoch": 1.375925925925926, "grad_norm": 0.08650446683168411, "learning_rate": 1e-06, "loss": 0.0034, "step": 210 }, { "clip_ratio": 0.00017285726651261073, "epoch": 1.3824074074074075, "grad_norm": 0.08462885767221451, "learning_rate": 1e-06, "loss": 0.0031, "step": 211 }, { "clip_ratio": 0.00037240791252337484, "epoch": 1.3888888888888888, "grad_norm": 0.08269508928060532, "learning_rate": 1e-06, "loss": 0.0027, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 343.1621398925781, "epoch": 1.3953703703703704, "grad_norm": 0.09740854054689407, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 197573641.0, "reward": 1.5839002586546398, "reward_std": 0.23600225948861667, "rewards/acc_reward_func": 1.5839002302714758, "step": 213 }, { "clip_ratio": 0.0001937114693213343, "epoch": 1.401851851851852, "grad_norm": 0.09612539410591125, "learning_rate": 1e-06, "loss": -0.0038, "step": 214 }, { "clip_ratio": 0.000410361105986383, "epoch": 1.4083333333333332, "grad_norm": 0.09062538295984268, "learning_rate": 1e-06, "loss": -0.0041, "step": 215 }, { "clip_ratio": 0.0006658303630371977, "epoch": 1.4148148148148147, "grad_norm": 0.08932670950889587, "learning_rate": 1e-06, "loss": -0.0046, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 345.11792137509303, "epoch": 1.4212962962962963, "grad_norm": 0.12799644470214844, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 200908101.0, "reward": 1.5249433432306563, "reward_std": 0.23494758918171837, "rewards/acc_reward_func": 1.5249433120091755, "step": 217 }, { "clip_ratio": 0.00014925150822992216, "epoch": 1.4277777777777778, "grad_norm": 0.10302122682332993, "learning_rate": 1e-06, "loss": 0.0015, "step": 218 }, { "clip_ratio": 0.00030673204136768444, "epoch": 1.4342592592592593, "grad_norm": 0.10091983526945114, "learning_rate": 1e-06, "loss": 0.0011, "step": 219 }, { "clip_ratio": 0.000536466390518139, "epoch": 1.4407407407407407, "grad_norm": 0.09628088772296906, "learning_rate": 1e-06, "loss": 0.0006, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 354.2879871186756, "epoch": 1.4472222222222222, "grad_norm": 0.08676353842020035, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 204461333.0, "reward": 1.4863945983705067, "reward_std": 0.18810446666819708, "rewards/acc_reward_func": 1.4863945699873424, "step": 221 }, { "clip_ratio": 0.00011991041174042039, "epoch": 1.4537037037037037, "grad_norm": 0.08577782660722733, "learning_rate": 1e-06, "loss": 0.0056, "step": 222 }, { "clip_ratio": 0.0002372816095538881, "epoch": 1.4601851851851853, "grad_norm": 0.08848343044519424, "learning_rate": 1e-06, "loss": 0.0053, "step": 223 }, { "clip_ratio": 0.0003101691067318565, "epoch": 1.4666666666666668, "grad_norm": 0.0836254134774208, "learning_rate": 1e-06, "loss": 0.0049, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 359.7074890136719, "epoch": 1.473148148148148, "grad_norm": 0.09105908870697021, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 208113905.0, "reward": 1.5317460695902507, "reward_std": 0.2008282034879639, "rewards/acc_reward_func": 1.5317460298538208, "step": 225 }, { "clip_ratio": 0.0001545734183829544, "epoch": 1.4796296296296296, "grad_norm": 0.08936156332492828, "learning_rate": 1e-06, "loss": 0.0025, "step": 226 }, { "clip_ratio": 0.0002477309672464062, "epoch": 1.4861111111111112, "grad_norm": 0.08730500191450119, "learning_rate": 1e-06, "loss": 0.0022, "step": 227 }, { "clip_ratio": 0.00045893899715294885, "epoch": 1.4925925925925925, "grad_norm": 0.08526027202606201, "learning_rate": 1e-06, "loss": 0.0018, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 358.7585071382068, "epoch": 1.499074074074074, "grad_norm": 0.09061747789382935, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 211741022.0, "reward": 1.4829932167416526, "reward_std": 0.20110960996576718, "rewards/acc_reward_func": 1.4829931883584886, "step": 229 }, { "clip_ratio": 0.00015922586594353474, "epoch": 1.5055555555555555, "grad_norm": 0.09076972305774689, "learning_rate": 1e-06, "loss": 0.0024, "step": 230 }, { "clip_ratio": 0.0003245450009541985, "epoch": 1.512037037037037, "grad_norm": 0.09029541909694672, "learning_rate": 1e-06, "loss": 0.0021, "step": 231 }, { "clip_ratio": 0.0005226546251963425, "epoch": 1.5185185185185186, "grad_norm": 0.08851893246173859, "learning_rate": 1e-06, "loss": 0.0017, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 380.81519862583707, "epoch": 1.525, "grad_norm": 0.08343932777643204, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 215696347.0, "reward": 1.503401395820436, "reward_std": 0.19729749645505631, "rewards/acc_reward_func": 1.5034013560840063, "step": 233 }, { "clip_ratio": 0.0001319292518712159, "epoch": 1.5314814814814814, "grad_norm": 0.08363550901412964, "learning_rate": 1e-06, "loss": -0.0011, "step": 234 }, { "clip_ratio": 0.00015108115725784695, "epoch": 1.537962962962963, "grad_norm": 0.08246675133705139, "learning_rate": 1e-06, "loss": -0.0014, "step": 235 }, { "clip_ratio": 0.00021679888924977387, "epoch": 1.5444444444444443, "grad_norm": 0.08294124901294708, "learning_rate": 1e-06, "loss": -0.0018, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 376.6542024158296, "epoch": 1.550925925925926, "grad_norm": 0.09547173976898193, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 219263570.0, "reward": 1.5589569409688313, "reward_std": 0.22031122871807643, "rewards/acc_reward_func": 1.558956923938933, "step": 237 }, { "clip_ratio": 9.522035404751521e-05, "epoch": 1.5574074074074074, "grad_norm": 0.09474051743745804, "learning_rate": 1e-06, "loss": 0.008, "step": 238 }, { "clip_ratio": 0.00018606578322803778, "epoch": 1.5638888888888889, "grad_norm": 0.09293164312839508, "learning_rate": 1e-06, "loss": 0.0077, "step": 239 }, { "clip_ratio": 0.000349092049873434, "epoch": 1.5703703703703704, "grad_norm": 0.08964758366346359, "learning_rate": 1e-06, "loss": 0.0071, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 374.5249488467262, "epoch": 1.5768518518518517, "grad_norm": 0.07777679711580276, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 222901257.0, "reward": 1.6167800653548468, "reward_std": 0.15760817981901623, "rewards/acc_reward_func": 1.6167800653548468, "step": 241 }, { "clip_ratio": 0.00013264746234026027, "epoch": 1.5833333333333335, "grad_norm": 0.07154504954814911, "learning_rate": 1e-06, "loss": -0.0013, "step": 242 }, { "clip_ratio": 0.00021582282629207752, "epoch": 1.5898148148148148, "grad_norm": 0.07036437839269638, "learning_rate": 1e-06, "loss": -0.0015, "step": 243 }, { "clip_ratio": 0.0004014643964183051, "epoch": 1.5962962962962963, "grad_norm": 0.06977999955415726, "learning_rate": 1e-06, "loss": -0.0018, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 360.1610049293155, "epoch": 1.6027777777777779, "grad_norm": 0.19480708241462708, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 226504849.0, "reward": 1.5680272494043623, "reward_std": 0.17598431486459004, "rewards/acc_reward_func": 1.5680272039912997, "step": 245 }, { "clip_ratio": 8.644776798540814e-05, "epoch": 1.6092592592592592, "grad_norm": 0.07636608183383942, "learning_rate": 1e-06, "loss": -0.001, "step": 246 }, { "clip_ratio": 0.00016529553405624547, "epoch": 1.6157407407407407, "grad_norm": 0.0800333172082901, "learning_rate": 1e-06, "loss": -0.0013, "step": 247 }, { "clip_ratio": 0.0002638628620410427, "epoch": 1.6222222222222222, "grad_norm": 0.07518580555915833, "learning_rate": 1e-06, "loss": -0.0016, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 383.0079432896205, "epoch": 1.6287037037037035, "grad_norm": 0.08973593264818192, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 230496284.0, "reward": 1.6439909594399589, "reward_std": 0.22272922364728792, "rewards/acc_reward_func": 1.6439909367334276, "step": 249 }, { "clip_ratio": 9.212724364174174e-05, "epoch": 1.6351851851851853, "grad_norm": 0.09071903675794601, "learning_rate": 1e-06, "loss": 0.0002, "step": 250 } ], "logging_steps": 1, "max_steps": 770, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }