diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4659 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.285185185185185, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 341.3775576636905, + "epoch": 0.006481481481481481, + "grad_norm": 0.14961056411266327, + "learning_rate": 0.0, + "loss": -0.0119, + "num_tokens": 3461091.0, + "reward": 1.2052154455866133, + "reward_std": 0.5575123486064729, + "rewards/acc_reward_func": 1.2052154285567147, + "step": 1 + }, + { + "clip_ratio": 0.0, + "epoch": 0.012962962962962963, + "grad_norm": 0.1496139019727707, + "learning_rate": 1.2987012987012988e-08, + "loss": -0.0119, + "step": 2 + }, + { + "clip_ratio": 0.0001939246332247941, + "epoch": 0.019444444444444445, + "grad_norm": 0.1510600745677948, + "learning_rate": 2.5974025974025976e-08, + "loss": -0.0118, + "step": 3 + }, + { + "clip_ratio": 0.0001826526765528667, + "epoch": 0.025925925925925925, + "grad_norm": 0.15029065310955048, + "learning_rate": 3.8961038961038956e-08, + "loss": -0.0118, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.6235874720982, + "epoch": 0.032407407407407406, + "grad_norm": 0.14211858808994293, + "learning_rate": 5.194805194805195e-08, + "loss": 0.003, + "num_tokens": 7190101.0, + "reward": 1.2403628343627566, + "reward_std": 0.5266946000712258, + "rewards/acc_reward_func": 1.2403628116562253, + "step": 5 + }, + { + "clip_ratio": 0.00025525176377933737, + "epoch": 0.03888888888888889, + "grad_norm": 0.1438485085964203, + "learning_rate": 6.493506493506492e-08, + "loss": 0.0029, + "step": 6 + }, + { + "clip_ratio": 0.0002173587291465429, + "epoch": 0.04537037037037037, + "grad_norm": 0.14308251440525055, + "learning_rate": 7.792207792207791e-08, + "loss": 0.003, + "step": 7 + }, + { + "clip_ratio": 0.00018235097082825157, + "epoch": 0.05185185185185185, + "grad_norm": 0.14172835648059845, + "learning_rate": 9.09090909090909e-08, + "loss": 0.0029, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.4966067359561, + "epoch": 0.058333333333333334, + "grad_norm": 0.15887229144573212, + "learning_rate": 1.038961038961039e-07, + "loss": -0.0015, + "num_tokens": 10969747.0, + "reward": 1.119047638915834, + "reward_std": 0.5877589228607359, + "rewards/acc_reward_func": 1.1190476105326699, + "step": 9 + }, + { + "clip_ratio": 0.00026269415199446183, + "epoch": 0.06481481481481481, + "grad_norm": 0.15948110818862915, + "learning_rate": 1.1688311688311688e-07, + "loss": -0.0015, + "step": 10 + }, + { + "clip_ratio": 0.00023042162952368104, + "epoch": 0.0712962962962963, + "grad_norm": 0.16856980323791504, + "learning_rate": 1.2987012987012984e-07, + "loss": -0.0015, + "step": 11 + }, + { + "clip_ratio": 0.0002131570793045241, + "epoch": 0.07777777777777778, + "grad_norm": 0.15859223902225494, + "learning_rate": 1.4285714285714285e-07, + "loss": -0.0015, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.6644025530134, + "epoch": 0.08425925925925926, + "grad_norm": 0.14740879833698273, + "learning_rate": 1.5584415584415582e-07, + "loss": -0.0089, + "num_tokens": 14484005.0, + "reward": 1.2426304050854273, + "reward_std": 0.5664714901220231, + "rewards/acc_reward_func": 1.2426303937321617, + "step": 13 + }, + { + "clip_ratio": 0.00024305369533110586, + "epoch": 0.09074074074074075, + "grad_norm": 0.14839285612106323, + "learning_rate": 1.6883116883116883e-07, + "loss": -0.0089, + "step": 14 + }, + { + "clip_ratio": 0.0002752210618512306, + "epoch": 0.09722222222222222, + "grad_norm": 0.14957986772060394, + "learning_rate": 1.818181818181818e-07, + "loss": -0.0089, + "step": 15 + }, + { + "clip_ratio": 0.0002536561978991986, + "epoch": 0.1037037037037037, + "grad_norm": 0.14792965352535248, + "learning_rate": 1.948051948051948e-07, + "loss": -0.0089, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.96145775204616, + "epoch": 0.11018518518518519, + "grad_norm": 0.1590029001235962, + "learning_rate": 2.077922077922078e-07, + "loss": -0.0036, + "num_tokens": 17968861.0, + "reward": 1.1337868769963582, + "reward_std": 0.6494243343671163, + "rewards/acc_reward_func": 1.1337868372599285, + "step": 17 + }, + { + "clip_ratio": 0.00021958356748135493, + "epoch": 0.11666666666666667, + "grad_norm": 0.1575755625963211, + "learning_rate": 2.2077922077922076e-07, + "loss": -0.0036, + "step": 18 + }, + { + "clip_ratio": 0.00023815594821436598, + "epoch": 0.12314814814814815, + "grad_norm": 0.15869197249412537, + "learning_rate": 2.3376623376623376e-07, + "loss": -0.0036, + "step": 19 + }, + { + "clip_ratio": 0.00024751310792496604, + "epoch": 0.12962962962962962, + "grad_norm": 0.16153784096240997, + "learning_rate": 2.4675324675324674e-07, + "loss": -0.0036, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.9512561616443, + "epoch": 0.1361111111111111, + "grad_norm": 0.14674022793769836, + "learning_rate": 2.597402597402597e-07, + "loss": 0.0003, + "num_tokens": 21588414.0, + "reward": 1.1496598862466358, + "reward_std": 0.5963577500411442, + "rewards/acc_reward_func": 1.1496598692167372, + "step": 21 + }, + { + "clip_ratio": 0.00019937331585207998, + "epoch": 0.1425925925925926, + "grad_norm": 0.14694689214229584, + "learning_rate": 2.727272727272727e-07, + "loss": 0.0003, + "step": 22 + }, + { + "clip_ratio": 0.00015258416533470154, + "epoch": 0.14907407407407408, + "grad_norm": 0.14661847054958344, + "learning_rate": 2.857142857142857e-07, + "loss": 0.0003, + "step": 23 + }, + { + "clip_ratio": 0.00018231397500910264, + "epoch": 0.15555555555555556, + "grad_norm": 0.14601057767868042, + "learning_rate": 2.987012987012987e-07, + "loss": 0.0003, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.5408208937872, + "epoch": 0.16203703703703703, + "grad_norm": 0.13536295294761658, + "learning_rate": 3.1168831168831165e-07, + "loss": 0.006, + "num_tokens": 25015431.0, + "reward": 1.2891156673431396, + "reward_std": 0.529950432124592, + "rewards/acc_reward_func": 1.289115655989874, + "step": 25 + }, + { + "clip_ratio": 0.000159220098707703, + "epoch": 0.1685185185185185, + "grad_norm": 0.1358516961336136, + "learning_rate": 3.2467532467532465e-07, + "loss": 0.006, + "step": 26 + }, + { + "clip_ratio": 0.00018200912347224186, + "epoch": 0.175, + "grad_norm": 0.13574668765068054, + "learning_rate": 3.3766233766233765e-07, + "loss": 0.006, + "step": 27 + }, + { + "clip_ratio": 0.0002001972615765962, + "epoch": 0.1814814814814815, + "grad_norm": 0.13535848259925842, + "learning_rate": 3.5064935064935066e-07, + "loss": 0.006, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.4512532552083, + "epoch": 0.18796296296296297, + "grad_norm": 0.15384796261787415, + "learning_rate": 3.636363636363636e-07, + "loss": -0.0034, + "num_tokens": 28410989.0, + "reward": 1.1836734868231273, + "reward_std": 0.5527269648654121, + "rewards/acc_reward_func": 1.1836734754698617, + "step": 29 + }, + { + "clip_ratio": 0.00026356185602101806, + "epoch": 0.19444444444444445, + "grad_norm": 0.15437930822372437, + "learning_rate": 3.766233766233766e-07, + "loss": -0.0034, + "step": 30 + }, + { + "clip_ratio": 0.0002261245880661244, + "epoch": 0.20092592592592592, + "grad_norm": 0.15577569603919983, + "learning_rate": 3.896103896103896e-07, + "loss": -0.0035, + "step": 31 + }, + { + "clip_ratio": 0.00024967778119302933, + "epoch": 0.2074074074074074, + "grad_norm": 0.1522216796875, + "learning_rate": 4.025974025974026e-07, + "loss": -0.0035, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.5476234072731, + "epoch": 0.21388888888888888, + "grad_norm": 0.14673157036304474, + "learning_rate": 4.155844155844156e-07, + "loss": -0.0123, + "num_tokens": 32101754.0, + "reward": 1.2324263226418268, + "reward_std": 0.5690165821995053, + "rewards/acc_reward_func": 1.232426316965194, + "step": 33 + }, + { + "clip_ratio": 0.00017888651875961971, + "epoch": 0.22037037037037038, + "grad_norm": 0.15019653737545013, + "learning_rate": 4.285714285714285e-07, + "loss": -0.0123, + "step": 34 + }, + { + "clip_ratio": 0.0002070159586894858, + "epoch": 0.22685185185185186, + "grad_norm": 0.14540189504623413, + "learning_rate": 4.415584415584415e-07, + "loss": -0.0123, + "step": 35 + }, + { + "clip_ratio": 0.00024694520574744923, + "epoch": 0.23333333333333334, + "grad_norm": 0.14700071513652802, + "learning_rate": 4.545454545454545e-07, + "loss": -0.0124, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.0714358375186, + "epoch": 0.23981481481481481, + "grad_norm": 0.14185680449008942, + "learning_rate": 4.675324675324675e-07, + "loss": 0.0017, + "num_tokens": 35797715.0, + "reward": 1.2755102061090016, + "reward_std": 0.5158246074404035, + "rewards/acc_reward_func": 1.275510203270685, + "step": 37 + }, + { + "clip_ratio": 0.00019795322302906286, + "epoch": 0.2462962962962963, + "grad_norm": 0.1420549750328064, + "learning_rate": 4.805194805194805e-07, + "loss": 0.0015, + "step": 38 + }, + { + "clip_ratio": 0.00026202407579625114, + "epoch": 0.25277777777777777, + "grad_norm": 0.1423768699169159, + "learning_rate": 4.935064935064935e-07, + "loss": 0.0015, + "step": 39 + }, + { + "clip_ratio": 0.00026934566440537484, + "epoch": 0.25925925925925924, + "grad_norm": 0.1396292746067047, + "learning_rate": 5.064935064935064e-07, + "loss": 0.0014, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.11111740838913, + "epoch": 0.2657407407407407, + "grad_norm": 0.14567208290100098, + "learning_rate": 5.194805194805194e-07, + "loss": 0.0041, + "num_tokens": 39264277.0, + "reward": 1.2585034256889707, + "reward_std": 0.5927184422810873, + "rewards/acc_reward_func": 1.2585034001441229, + "step": 41 + }, + { + "clip_ratio": 0.00025571950287225524, + "epoch": 0.2722222222222222, + "grad_norm": 0.14597439765930176, + "learning_rate": 5.324675324675324e-07, + "loss": 0.0041, + "step": 42 + }, + { + "clip_ratio": 0.00022081260283898918, + "epoch": 0.27870370370370373, + "grad_norm": 0.16748785972595215, + "learning_rate": 5.454545454545454e-07, + "loss": 0.004, + "step": 43 + }, + { + "clip_ratio": 0.00034207346158966957, + "epoch": 0.2851851851851852, + "grad_norm": 0.14416222274303436, + "learning_rate": 5.584415584415584e-07, + "loss": 0.0039, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.09977649507067, + "epoch": 0.2916666666666667, + "grad_norm": 0.1368248164653778, + "learning_rate": 5.714285714285714e-07, + "loss": 0.0052, + "num_tokens": 42894683.0, + "reward": 1.3628118208476476, + "reward_std": 0.5181492559966587, + "rewards/acc_reward_func": 1.3628117924644834, + "step": 45 + }, + { + "clip_ratio": 0.00015652789096791474, + "epoch": 0.29814814814814816, + "grad_norm": 0.13666287064552307, + "learning_rate": 5.844155844155844e-07, + "loss": 0.0051, + "step": 46 + }, + { + "clip_ratio": 0.0002253453385492321, + "epoch": 0.30462962962962964, + "grad_norm": 0.1370047777891159, + "learning_rate": 5.974025974025974e-07, + "loss": 0.0051, + "step": 47 + }, + { + "clip_ratio": 0.00027938479650488476, + "epoch": 0.3111111111111111, + "grad_norm": 0.13495419919490814, + "learning_rate": 6.103896103896103e-07, + "loss": 0.0049, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.9739292689732, + "epoch": 0.3175925925925926, + "grad_norm": 0.12604905664920807, + "learning_rate": 6.233766233766233e-07, + "loss": -0.0111, + "num_tokens": 46253250.0, + "reward": 1.3662131769316537, + "reward_std": 0.4832150155589694, + "rewards/acc_reward_func": 1.3662131428718567, + "step": 49 + }, + { + "clip_ratio": 0.0001608005228003354, + "epoch": 0.32407407407407407, + "grad_norm": 0.12657120823860168, + "learning_rate": 6.363636363636363e-07, + "loss": -0.0112, + "step": 50 + }, + { + "clip_ratio": 0.00018877648212115413, + "epoch": 0.33055555555555555, + "grad_norm": 0.12609201669692993, + "learning_rate": 6.493506493506493e-07, + "loss": -0.0112, + "step": 51 + }, + { + "clip_ratio": 0.00020845069507014982, + "epoch": 0.337037037037037, + "grad_norm": 0.12553073465824127, + "learning_rate": 6.623376623376623e-07, + "loss": -0.0114, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.6439964657738, + "epoch": 0.3435185185185185, + "grad_norm": 0.1337604820728302, + "learning_rate": 6.753246753246753e-07, + "loss": 0.0089, + "num_tokens": 50081086.0, + "reward": 1.2913832437424433, + "reward_std": 0.5028326373015132, + "rewards/acc_reward_func": 1.2913832181975955, + "step": 53 + }, + { + "clip_ratio": 0.00016935093160663244, + "epoch": 0.35, + "grad_norm": 0.1348680555820465, + "learning_rate": 6.883116883116883e-07, + "loss": 0.0088, + "step": 54 + }, + { + "clip_ratio": 0.00019904559875223122, + "epoch": 0.35648148148148145, + "grad_norm": 0.13407811522483826, + "learning_rate": 7.012987012987013e-07, + "loss": 0.0087, + "step": 55 + }, + { + "clip_ratio": 0.0002497758089573056, + "epoch": 0.362962962962963, + "grad_norm": 0.13265763223171234, + "learning_rate": 7.142857142857143e-07, + "loss": 0.0085, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.2063555036272, + "epoch": 0.36944444444444446, + "grad_norm": 0.12446761876344681, + "learning_rate": 7.272727272727272e-07, + "loss": 0.0028, + "num_tokens": 53519154.0, + "reward": 1.2641723610105968, + "reward_std": 0.4625132538023449, + "rewards/acc_reward_func": 1.2641723383040655, + "step": 57 + }, + { + "clip_ratio": 0.00018638262017269707, + "epoch": 0.37592592592592594, + "grad_norm": 0.12733881175518036, + "learning_rate": 7.402597402597402e-07, + "loss": 0.0028, + "step": 58 + }, + { + "clip_ratio": 0.0002025287897032242, + "epoch": 0.3824074074074074, + "grad_norm": 0.12367873638868332, + "learning_rate": 7.532467532467532e-07, + "loss": 0.0027, + "step": 59 + }, + { + "clip_ratio": 0.000297035732641927, + "epoch": 0.3888888888888889, + "grad_norm": 0.12311229109764099, + "learning_rate": 7.662337662337662e-07, + "loss": 0.0025, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.8061290922619, + "epoch": 0.39537037037037037, + "grad_norm": 0.13111141324043274, + "learning_rate": 7.792207792207792e-07, + "loss": 0.0106, + "num_tokens": 57414909.0, + "reward": 1.2641723638489133, + "reward_std": 0.4437567651981399, + "rewards/acc_reward_func": 1.2641723383040655, + "step": 61 + }, + { + "clip_ratio": 0.00016025772319629877, + "epoch": 0.40185185185185185, + "grad_norm": 0.12952443957328796, + "learning_rate": 7.922077922077922e-07, + "loss": 0.0104, + "step": 62 + }, + { + "clip_ratio": 0.00023572324270000017, + "epoch": 0.4083333333333333, + "grad_norm": 0.1300145536661148, + "learning_rate": 8.051948051948052e-07, + "loss": 0.0103, + "step": 63 + }, + { + "clip_ratio": 0.00037697855891781815, + "epoch": 0.4148148148148148, + "grad_norm": 0.12851402163505554, + "learning_rate": 8.181818181818182e-07, + "loss": 0.0101, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.216561453683, + "epoch": 0.4212962962962963, + "grad_norm": 0.1267780065536499, + "learning_rate": 8.311688311688312e-07, + "loss": -0.0102, + "num_tokens": 60989390.0, + "reward": 1.3015873318626767, + "reward_std": 0.41901361587501706, + "rewards/acc_reward_func": 1.3015873034795125, + "step": 65 + }, + { + "clip_ratio": 0.00017322945980898416, + "epoch": 0.42777777777777776, + "grad_norm": 0.12472664564847946, + "learning_rate": 8.44155844155844e-07, + "loss": -0.0103, + "step": 66 + }, + { + "clip_ratio": 0.000287876123149458, + "epoch": 0.43425925925925923, + "grad_norm": 0.15554016828536987, + "learning_rate": 8.57142857142857e-07, + "loss": -0.0105, + "step": 67 + }, + { + "clip_ratio": 0.0003167382113003571, + "epoch": 0.44074074074074077, + "grad_norm": 0.12460165470838547, + "learning_rate": 8.7012987012987e-07, + "loss": -0.0107, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.6723414829799, + "epoch": 0.44722222222222224, + "grad_norm": 0.12794345617294312, + "learning_rate": 8.83116883116883e-07, + "loss": -0.0049, + "num_tokens": 64756585.0, + "reward": 1.3684807561692738, + "reward_std": 0.39905343807878946, + "rewards/acc_reward_func": 1.3684807050795782, + "step": 69 + }, + { + "clip_ratio": 0.00012579495558470843, + "epoch": 0.4537037037037037, + "grad_norm": 0.12346912920475006, + "learning_rate": 8.96103896103896e-07, + "loss": -0.005, + "step": 70 + }, + { + "clip_ratio": 0.0001513983377316479, + "epoch": 0.4601851851851852, + "grad_norm": 0.1247495487332344, + "learning_rate": 9.09090909090909e-07, + "loss": -0.0052, + "step": 71 + }, + { + "clip_ratio": 0.0002705767042893318, + "epoch": 0.4666666666666667, + "grad_norm": 0.1223696619272232, + "learning_rate": 9.22077922077922e-07, + "loss": -0.0055, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.5226818266369, + "epoch": 0.47314814814814815, + "grad_norm": 0.12345382571220398, + "learning_rate": 9.35064935064935e-07, + "loss": -0.0019, + "num_tokens": 68501142.0, + "reward": 1.3707483268919445, + "reward_std": 0.4370829054997081, + "rewards/acc_reward_func": 1.3707483155386788, + "step": 73 + }, + { + "clip_ratio": 0.00018952846065596013, + "epoch": 0.47962962962962963, + "grad_norm": 0.12303110212087631, + "learning_rate": 9.480519480519479e-07, + "loss": -0.002, + "step": 74 + }, + { + "clip_ratio": 0.0002436372330218243, + "epoch": 0.4861111111111111, + "grad_norm": 0.12270648032426834, + "learning_rate": 9.61038961038961e-07, + "loss": -0.0023, + "step": 75 + }, + { + "clip_ratio": 0.0003469188186933198, + "epoch": 0.4925925925925926, + "grad_norm": 0.12180577218532562, + "learning_rate": 9.74025974025974e-07, + "loss": -0.0026, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.16893804640995, + "epoch": 0.49907407407407406, + "grad_norm": 0.11210431158542633, + "learning_rate": 9.87012987012987e-07, + "loss": 0.0069, + "num_tokens": 72117875.0, + "reward": 1.3673469736462547, + "reward_std": 0.36692249313706443, + "rewards/acc_reward_func": 1.3673469566163563, + "step": 77 + }, + { + "clip_ratio": 0.00018126107360807336, + "epoch": 0.5055555555555555, + "grad_norm": 0.11090458184480667, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 78 + }, + { + "clip_ratio": 0.0002199545553940836, + "epoch": 0.5120370370370371, + "grad_norm": 0.10997123271226883, + "learning_rate": 1e-06, + "loss": 0.0066, + "step": 79 + }, + { + "clip_ratio": 0.00034607137768900777, + "epoch": 0.5185185185185185, + "grad_norm": 0.1732209026813507, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.36735171363466, + "epoch": 0.525, + "grad_norm": 0.10826165974140167, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 75894389.0, + "reward": 1.3752834740139188, + "reward_std": 0.33868200793152764, + "rewards/acc_reward_func": 1.3752834569840204, + "step": 81 + }, + { + "clip_ratio": 0.00017236430254082995, + "epoch": 0.5314814814814814, + "grad_norm": 0.1089528352022171, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 82 + }, + { + "clip_ratio": 0.0002018414240973514, + "epoch": 0.537962962962963, + "grad_norm": 0.10826719552278519, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 83 + }, + { + "clip_ratio": 0.0002509245447678647, + "epoch": 0.5444444444444444, + "grad_norm": 0.1076684221625328, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.232430594308, + "epoch": 0.5509259259259259, + "grad_norm": 0.10895638167858124, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 79825968.0, + "reward": 1.447845839318775, + "reward_std": 0.2928337816681181, + "rewards/acc_reward_func": 1.447845805258978, + "step": 85 + }, + { + "clip_ratio": 0.00012420982654605593, + "epoch": 0.5574074074074075, + "grad_norm": 0.10320693999528885, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 86 + }, + { + "clip_ratio": 0.00014987519015059142, + "epoch": 0.5638888888888889, + "grad_norm": 0.10314636677503586, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 87 + }, + { + "clip_ratio": 0.00020615724549445855, + "epoch": 0.5703703703703704, + "grad_norm": 0.1004006415605545, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.4410487583705, + "epoch": 0.5768518518518518, + "grad_norm": 0.10505778342485428, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 83484809.0, + "reward": 1.418367367415201, + "reward_std": 0.33532609577689854, + "rewards/acc_reward_func": 1.4183673447086698, + "step": 89 + }, + { + "clip_ratio": 0.00018661540144378143, + "epoch": 0.5833333333333334, + "grad_norm": 0.10543543845415115, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 90 + }, + { + "clip_ratio": 0.0002658298014596637, + "epoch": 0.5898148148148148, + "grad_norm": 0.10678199678659439, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 91 + }, + { + "clip_ratio": 0.00048629668336139904, + "epoch": 0.5962962962962963, + "grad_norm": 0.10415786504745483, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.6065862746466, + "epoch": 0.6027777777777777, + "grad_norm": 0.12411384284496307, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 87190872.0, + "reward": 1.4285714569545926, + "reward_std": 0.3354296861659913, + "rewards/acc_reward_func": 1.4285714342480613, + "step": 93 + }, + { + "clip_ratio": 0.0001295947523820879, + "epoch": 0.6092592592592593, + "grad_norm": 0.11555258184671402, + "learning_rate": 1e-06, + "loss": -0.0092, + "step": 94 + }, + { + "clip_ratio": 0.00023690979426083643, + "epoch": 0.6157407407407407, + "grad_norm": 0.11537593603134155, + "learning_rate": 1e-06, + "loss": -0.0094, + "step": 95 + }, + { + "clip_ratio": 0.0004145491839153692, + "epoch": 0.6222222222222222, + "grad_norm": 0.1124836802482605, + "learning_rate": 1e-06, + "loss": -0.0097, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.0283493768601, + "epoch": 0.6287037037037037, + "grad_norm": 0.1106274202466011, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 90867337.0, + "reward": 1.3321995706785292, + "reward_std": 0.3691760081620443, + "rewards/acc_reward_func": 1.332199547971998, + "step": 97 + }, + { + "clip_ratio": 0.00021598117850122174, + "epoch": 0.6351851851851852, + "grad_norm": 0.11104429513216019, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 98 + }, + { + "clip_ratio": 0.00030358976974163116, + "epoch": 0.6416666666666667, + "grad_norm": 0.11258433759212494, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 99 + }, + { + "clip_ratio": 0.000491898344729894, + "epoch": 0.6481481481481481, + "grad_norm": 0.10996989905834198, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.0181477864583, + "epoch": 0.6546296296296297, + "grad_norm": 0.11205079406499863, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 94459043.0, + "reward": 1.3310657909938268, + "reward_std": 0.36978748050473986, + "rewards/acc_reward_func": 1.3310657569340296, + "step": 101 + }, + { + "clip_ratio": 0.00014175956433367295, + "epoch": 0.6611111111111111, + "grad_norm": 0.11174244433641434, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 102 + }, + { + "clip_ratio": 0.00017096654950624464, + "epoch": 0.6675925925925926, + "grad_norm": 0.1625394970178604, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 103 + }, + { + "clip_ratio": 0.00030162992853937406, + "epoch": 0.674074074074074, + "grad_norm": 0.10828293114900589, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.01587640671505, + "epoch": 0.6805555555555556, + "grad_norm": 0.10728833824396133, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 98074129.0, + "reward": 1.3945578563781011, + "reward_std": 0.3750602141732261, + "rewards/acc_reward_func": 1.3945578223183042, + "step": 105 + }, + { + "clip_ratio": 0.00015117749440140047, + "epoch": 0.687037037037037, + "grad_norm": 0.10758285224437714, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 106 + }, + { + "clip_ratio": 0.00019362062206103778, + "epoch": 0.6935185185185185, + "grad_norm": 0.10650208592414856, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 107 + }, + { + "clip_ratio": 0.0004002706260680931, + "epoch": 0.7, + "grad_norm": 0.10655343532562256, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.0793747674851, + "epoch": 0.7064814814814815, + "grad_norm": 0.11842089891433716, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 101921189.0, + "reward": 1.419501157034011, + "reward_std": 0.3633141240903309, + "rewards/acc_reward_func": 1.419501128650847, + "step": 109 + }, + { + "clip_ratio": 0.00015762935955925577, + "epoch": 0.7129629629629629, + "grad_norm": 0.11337336152791977, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 110 + }, + { + "clip_ratio": 0.0002720732234246541, + "epoch": 0.7194444444444444, + "grad_norm": 0.11286605894565582, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 111 + }, + { + "clip_ratio": 0.0004098817347160851, + "epoch": 0.725925925925926, + "grad_norm": 0.14437150955200195, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.4637262253534, + "epoch": 0.7324074074074074, + "grad_norm": 0.1585981696844101, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 105554040.0, + "reward": 1.4580499132474263, + "reward_std": 0.31650315987921895, + "rewards/acc_reward_func": 1.458049896217528, + "step": 113 + }, + { + "clip_ratio": 0.0001126181769428686, + "epoch": 0.7388888888888889, + "grad_norm": 0.09816838800907135, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 114 + }, + { + "clip_ratio": 0.00020272329320072285, + "epoch": 0.7453703703703703, + "grad_norm": 0.09763254970312119, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 115 + }, + { + "clip_ratio": 0.00032899622670984607, + "epoch": 0.7518518518518519, + "grad_norm": 0.0961078405380249, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.40476771763394, + "epoch": 0.7583333333333333, + "grad_norm": 0.11743240058422089, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 109165971.0, + "reward": 1.434240386599586, + "reward_std": 0.33151187073616756, + "rewards/acc_reward_func": 1.4342403638930548, + "step": 117 + }, + { + "clip_ratio": 0.00016299381570119987, + "epoch": 0.7648148148148148, + "grad_norm": 0.11144307255744934, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 118 + }, + { + "clip_ratio": 0.00021788747177370603, + "epoch": 0.7712962962962963, + "grad_norm": 0.11082971841096878, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 119 + }, + { + "clip_ratio": 0.00036926281176246923, + "epoch": 0.7777777777777778, + "grad_norm": 0.10246479511260986, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.10658046177457, + "epoch": 0.7842592592592592, + "grad_norm": 0.09951800853013992, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 112830427.0, + "reward": 1.5158730347951253, + "reward_std": 0.3210904362301032, + "rewards/acc_reward_func": 1.5158730291184925, + "step": 121 + }, + { + "clip_ratio": 0.0001313327319665058, + "epoch": 0.7907407407407407, + "grad_norm": 0.102614626288414, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 122 + }, + { + "clip_ratio": 0.00019906960508143086, + "epoch": 0.7972222222222223, + "grad_norm": 0.11046651005744934, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 123 + }, + { + "clip_ratio": 0.0003351353970217696, + "epoch": 0.8037037037037037, + "grad_norm": 0.1014101579785347, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.8957010904948, + "epoch": 0.8101851851851852, + "grad_norm": 0.10484964400529861, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 116491721.0, + "reward": 1.4365079601605732, + "reward_std": 0.3235861455045995, + "rewards/acc_reward_func": 1.4365079317774092, + "step": 125 + }, + { + "clip_ratio": 0.00018362473704092692, + "epoch": 0.8166666666666667, + "grad_norm": 0.10315818339586258, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 126 + }, + { + "clip_ratio": 0.0003200927950231181, + "epoch": 0.8231481481481482, + "grad_norm": 0.09985575079917908, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 127 + }, + { + "clip_ratio": 0.0006616064056288451, + "epoch": 0.8296296296296296, + "grad_norm": 0.09683224558830261, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.2698451450893, + "epoch": 0.8361111111111111, + "grad_norm": 0.11193078756332397, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 119939031.0, + "reward": 1.4308390333538963, + "reward_std": 0.3666802270426637, + "rewards/acc_reward_func": 1.4308389992940993, + "step": 129 + }, + { + "clip_ratio": 0.0002013682514232295, + "epoch": 0.8425925925925926, + "grad_norm": 0.11090143024921417, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 130 + }, + { + "clip_ratio": 0.00041231308672909757, + "epoch": 0.8490740740740741, + "grad_norm": 0.10781311243772507, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 131 + }, + { + "clip_ratio": 0.0008236173737151105, + "epoch": 0.8555555555555555, + "grad_norm": 0.11081597954034805, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.4410487583705, + "epoch": 0.862037037037037, + "grad_norm": 0.09571786969900131, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 123640940.0, + "reward": 1.4591836957704454, + "reward_std": 0.3062526817832674, + "rewards/acc_reward_func": 1.4591836702255976, + "step": 133 + }, + { + "clip_ratio": 0.00016750225785515447, + "epoch": 0.8685185185185185, + "grad_norm": 0.09535212069749832, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 134 + }, + { + "clip_ratio": 0.0002182886072703349, + "epoch": 0.875, + "grad_norm": 0.09480314701795578, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 135 + }, + { + "clip_ratio": 0.0003651986255482327, + "epoch": 0.8814814814814815, + "grad_norm": 0.09624945372343063, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.3832266671317, + "epoch": 0.887962962962963, + "grad_norm": 0.10961057245731354, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 127169248.0, + "reward": 1.3900227064178103, + "reward_std": 0.38218230328389574, + "rewards/acc_reward_func": 1.3900226638430642, + "step": 137 + }, + { + "clip_ratio": 0.00014266002645377913, + "epoch": 0.8944444444444445, + "grad_norm": 0.10866183787584305, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 138 + }, + { + "clip_ratio": 0.0002758925520105376, + "epoch": 0.9009259259259259, + "grad_norm": 0.10819561779499054, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 139 + }, + { + "clip_ratio": 0.0005836994775260488, + "epoch": 0.9074074074074074, + "grad_norm": 0.10537825524806976, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.5873078845796, + "epoch": 0.9138888888888889, + "grad_norm": 0.10795921087265015, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 130807932.0, + "reward": 1.394557862054734, + "reward_std": 0.36527003596226376, + "rewards/acc_reward_func": 1.3945578393482028, + "step": 141 + }, + { + "clip_ratio": 0.00017469531060972562, + "epoch": 0.9203703703703704, + "grad_norm": 0.10809798538684845, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 142 + }, + { + "clip_ratio": 0.0004895108766158089, + "epoch": 0.9268518518518518, + "grad_norm": 0.1056734025478363, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 143 + }, + { + "clip_ratio": 0.0009459215119325867, + "epoch": 0.9333333333333333, + "grad_norm": 0.10791812837123871, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.20408848353793, + "epoch": 0.9398148148148148, + "grad_norm": 0.09965453296899796, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 134599680.0, + "reward": 1.4036281591369992, + "reward_std": 0.30771812565979506, + "rewards/acc_reward_func": 1.4036281250772022, + "step": 145 + }, + { + "clip_ratio": 0.00018105733518799147, + "epoch": 0.9462962962962963, + "grad_norm": 0.09854891151189804, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 146 + }, + { + "clip_ratio": 0.0004012293509385061, + "epoch": 0.9527777777777777, + "grad_norm": 0.09946269541978836, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 147 + }, + { + "clip_ratio": 0.0005813377453402305, + "epoch": 0.9592592592592593, + "grad_norm": 0.10020922869443893, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.29479253859745, + "epoch": 0.9657407407407408, + "grad_norm": 0.10600566118955612, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 138533624.0, + "reward": 1.4217687391099476, + "reward_std": 0.31214298538508867, + "rewards/acc_reward_func": 1.4217687050501506, + "step": 149 + }, + { + "clip_ratio": 0.0001907412124204538, + "epoch": 0.9722222222222222, + "grad_norm": 0.10447903722524643, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 150 + }, + { + "clip_ratio": 0.0003977599562425721, + "epoch": 0.9787037037037037, + "grad_norm": 0.09847753494977951, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 151 + }, + { + "clip_ratio": 0.000723014666229054, + "epoch": 0.9851851851851852, + "grad_norm": 0.09649986028671265, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.8061290922619, + "epoch": 1.0064814814814815, + "grad_norm": 0.1145603284239769, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 142234001.0, + "reward": 1.4308390333538963, + "reward_std": 0.32152382461797624, + "rewards/acc_reward_func": 1.4308390049707322, + "step": 153 + }, + { + "clip_ratio": 0.00018503319093031765, + "epoch": 1.012962962962963, + "grad_norm": 0.10569456219673157, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 154 + }, + { + "clip_ratio": 0.00037826129084264507, + "epoch": 1.0194444444444444, + "grad_norm": 0.10441906005144119, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 155 + }, + { + "clip_ratio": 0.0005787124890568, + "epoch": 1.025925925925926, + "grad_norm": 0.10377710312604904, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.7063555036272, + "epoch": 1.0324074074074074, + "grad_norm": 0.10566150397062302, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 145994812.0, + "reward": 1.4965986637842088, + "reward_std": 0.30973265550675844, + "rewards/acc_reward_func": 1.4965986410776775, + "step": 157 + }, + { + "clip_ratio": 0.0001249600378893471, + "epoch": 1.038888888888889, + "grad_norm": 0.10384287685155869, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 158 + }, + { + "clip_ratio": 0.0002204828872761157, + "epoch": 1.0453703703703703, + "grad_norm": 0.10222744941711426, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 159 + }, + { + "clip_ratio": 0.0004675700528466786, + "epoch": 1.0518518518518518, + "grad_norm": 0.10059863328933716, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.2256338936942, + "epoch": 1.0583333333333333, + "grad_norm": 0.11056972295045853, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 149621705.0, + "reward": 1.5204081819170998, + "reward_std": 0.32912652584768476, + "rewards/acc_reward_func": 1.5204081648872012, + "step": 161 + }, + { + "clip_ratio": 0.00016151682436992858, + "epoch": 1.0648148148148149, + "grad_norm": 0.11060597002506256, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 162 + }, + { + "clip_ratio": 0.0004163828950064878, + "epoch": 1.0712962962962962, + "grad_norm": 0.10768898576498032, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 163 + }, + { + "clip_ratio": 0.000778782024488984, + "epoch": 1.0777777777777777, + "grad_norm": 0.10523418337106705, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.5136093866257, + "epoch": 1.0842592592592593, + "grad_norm": 0.1091051772236824, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 153198374.0, + "reward": 1.4353741719609214, + "reward_std": 0.27784250605674016, + "rewards/acc_reward_func": 1.435374140739441, + "step": 165 + }, + { + "clip_ratio": 0.00019789206375467723, + "epoch": 1.0907407407407408, + "grad_norm": 0.11133051663637161, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 166 + }, + { + "clip_ratio": 0.0003219507693540349, + "epoch": 1.0972222222222223, + "grad_norm": 0.1063733696937561, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 167 + }, + { + "clip_ratio": 0.0007354825044915612, + "epoch": 1.1037037037037036, + "grad_norm": 0.10145536810159683, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.5045413062686, + "epoch": 1.1101851851851852, + "grad_norm": 0.0959630087018013, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 156847449.0, + "reward": 1.4739229253360204, + "reward_std": 0.2454074023380166, + "rewards/acc_reward_func": 1.473922916821071, + "step": 169 + }, + { + "clip_ratio": 0.00013901671592888998, + "epoch": 1.1166666666666667, + "grad_norm": 0.09553571790456772, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 170 + }, + { + "clip_ratio": 0.0002768557380823906, + "epoch": 1.1231481481481482, + "grad_norm": 0.09550315886735916, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 171 + }, + { + "clip_ratio": 0.0005109132904354261, + "epoch": 1.1296296296296295, + "grad_norm": 0.09238159656524658, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.7698480515253, + "epoch": 1.136111111111111, + "grad_norm": 0.1056121438741684, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 160395520.0, + "reward": 1.4829932167416526, + "reward_std": 0.27240987175277304, + "rewards/acc_reward_func": 1.4829931997117543, + "step": 173 + }, + { + "clip_ratio": 0.00018629198200956342, + "epoch": 1.1425925925925926, + "grad_norm": 0.10767155885696411, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 174 + }, + { + "clip_ratio": 0.00030426528593081805, + "epoch": 1.1490740740740741, + "grad_norm": 0.10210078209638596, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 175 + }, + { + "clip_ratio": 0.0007162606953421519, + "epoch": 1.1555555555555554, + "grad_norm": 0.0999271348118782, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.3401416596912, + "epoch": 1.162037037037037, + "grad_norm": 0.10751984268426895, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 164255542.0, + "reward": 1.5430839345568703, + "reward_std": 0.2709279392092001, + "rewards/acc_reward_func": 1.5430838948204404, + "step": 177 + }, + { + "clip_ratio": 0.00019573130737712962, + "epoch": 1.1685185185185185, + "grad_norm": 0.11798489093780518, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 178 + }, + { + "clip_ratio": 0.00034863099045608016, + "epoch": 1.175, + "grad_norm": 0.10651062428951263, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 179 + }, + { + "clip_ratio": 0.0006314800730684684, + "epoch": 1.1814814814814816, + "grad_norm": 0.10761118680238724, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.4399167015439, + "epoch": 1.1879629629629629, + "grad_norm": 0.10813816636800766, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 167941502.0, + "reward": 1.416099800950005, + "reward_std": 0.23997027143126443, + "rewards/acc_reward_func": 1.4160997640518915, + "step": 181 + }, + { + "clip_ratio": 9.955801527082388e-05, + "epoch": 1.1944444444444444, + "grad_norm": 0.10921348631381989, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 182 + }, + { + "clip_ratio": 0.00018630296004370654, + "epoch": 1.200925925925926, + "grad_norm": 0.10632283985614777, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 183 + }, + { + "clip_ratio": 0.00047109573885488015, + "epoch": 1.2074074074074075, + "grad_norm": 0.1052081361413002, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.4988723028274, + "epoch": 1.2138888888888888, + "grad_norm": 0.09669824689626694, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 171898126.0, + "reward": 1.5374149935586112, + "reward_std": 0.22926786666115126, + "rewards/acc_reward_func": 1.5374149680137634, + "step": 185 + }, + { + "clip_ratio": 0.0001581089007751351, + "epoch": 1.2203703703703703, + "grad_norm": 0.09097945690155029, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 186 + }, + { + "clip_ratio": 0.0001345763438231578, + "epoch": 1.2268518518518519, + "grad_norm": 0.09122958034276962, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 187 + }, + { + "clip_ratio": 0.00023815015707181634, + "epoch": 1.2333333333333334, + "grad_norm": 0.08839327096939087, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.2641790480841, + "epoch": 1.2398148148148147, + "grad_norm": 0.12880218029022217, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 175634727.0, + "reward": 1.486394587017241, + "reward_std": 0.3775232934526035, + "rewards/acc_reward_func": 1.4863945699873424, + "step": 189 + }, + { + "clip_ratio": 0.00019796302409044335, + "epoch": 1.2462962962962962, + "grad_norm": 0.12655557692050934, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 190 + }, + { + "clip_ratio": 0.0004977159599851196, + "epoch": 1.2527777777777778, + "grad_norm": 0.12340408563613892, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 191 + }, + { + "clip_ratio": 0.0007802596893660459, + "epoch": 1.2592592592592593, + "grad_norm": 0.12486658990383148, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.3310706728981, + "epoch": 1.2657407407407408, + "grad_norm": 0.12545065581798553, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 179429101.0, + "reward": 1.5918367760522025, + "reward_std": 0.24694282561540604, + "rewards/acc_reward_func": 1.591836724962507, + "step": 193 + }, + { + "clip_ratio": 0.00014233627700291219, + "epoch": 1.2722222222222221, + "grad_norm": 0.09697285294532776, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 194 + }, + { + "clip_ratio": 0.00021537907499199113, + "epoch": 1.2787037037037037, + "grad_norm": 0.09571573138237, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 195 + }, + { + "clip_ratio": 0.0003717566467544419, + "epoch": 1.2851851851851852, + "grad_norm": 0.09427618980407715, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.25964210146947, + "epoch": 1.2916666666666667, + "grad_norm": 0.10039424896240234, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 183207074.0, + "reward": 1.4138322273890178, + "reward_std": 0.2701569392922379, + "rewards/acc_reward_func": 1.4138321990058536, + "step": 197 + }, + { + "clip_ratio": 0.00013769451262695448, + "epoch": 1.2981481481481483, + "grad_norm": 0.10113991051912308, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 198 + }, + { + "clip_ratio": 0.00026389724596464516, + "epoch": 1.3046296296296296, + "grad_norm": 0.09699271619319916, + "learning_rate": 1e-06, + "loss": 0.0052, + "step": 199 + }, + { + "clip_ratio": 0.00047017908599671153, + "epoch": 1.3111111111111111, + "grad_norm": 0.09559627622365952, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.6417265392485, + "epoch": 1.3175925925925926, + "grad_norm": 0.12740236520767212, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 186949450.0, + "reward": 1.5963719118209112, + "reward_std": 0.287948662681239, + "rewards/acc_reward_func": 1.5963718777611142, + "step": 201 + }, + { + "clip_ratio": 0.00018609766872638525, + "epoch": 1.324074074074074, + "grad_norm": 0.10720109194517136, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 202 + }, + { + "clip_ratio": 0.00027242925110234256, + "epoch": 1.3305555555555555, + "grad_norm": 0.10511540621519089, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 203 + }, + { + "clip_ratio": 0.00042802050129033713, + "epoch": 1.337037037037037, + "grad_norm": 0.10465174168348312, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1644054594494, + "epoch": 1.3435185185185186, + "grad_norm": 0.09750308096408844, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 190654781.0, + "reward": 1.5521542231241863, + "reward_std": 0.2286651549594743, + "rewards/acc_reward_func": 1.552154194741022, + "step": 205 + }, + { + "clip_ratio": 0.00012115049078905334, + "epoch": 1.35, + "grad_norm": 0.09753656387329102, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 206 + }, + { + "clip_ratio": 0.00019190308757104156, + "epoch": 1.3564814814814814, + "grad_norm": 0.09476270526647568, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 207 + }, + { + "clip_ratio": 0.00031734065331485387, + "epoch": 1.362962962962963, + "grad_norm": 0.09239774942398071, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.24830554780505, + "epoch": 1.3694444444444445, + "grad_norm": 0.08607508987188339, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 194142806.0, + "reward": 1.7006802956263225, + "reward_std": 0.19712306239775249, + "rewards/acc_reward_func": 1.700680278596424, + "step": 209 + }, + { + "clip_ratio": 0.00010146752867426368, + "epoch": 1.375925925925926, + "grad_norm": 0.08650446683168411, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 210 + }, + { + "clip_ratio": 0.00017285726651261073, + "epoch": 1.3824074074074075, + "grad_norm": 0.08462885767221451, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 211 + }, + { + "clip_ratio": 0.00037240791252337484, + "epoch": 1.3888888888888888, + "grad_norm": 0.08269508928060532, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.1621398925781, + "epoch": 1.3953703703703704, + "grad_norm": 0.09740854054689407, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 197573641.0, + "reward": 1.5839002586546398, + "reward_std": 0.23600225948861667, + "rewards/acc_reward_func": 1.5839002302714758, + "step": 213 + }, + { + "clip_ratio": 0.0001937114693213343, + "epoch": 1.401851851851852, + "grad_norm": 0.09612539410591125, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 214 + }, + { + "clip_ratio": 0.000410361105986383, + "epoch": 1.4083333333333332, + "grad_norm": 0.09062538295984268, + "learning_rate": 1e-06, + "loss": -0.0041, + "step": 215 + }, + { + "clip_ratio": 0.0006658303630371977, + "epoch": 1.4148148148148147, + "grad_norm": 0.08932670950889587, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.11792137509303, + "epoch": 1.4212962962962963, + "grad_norm": 0.12799644470214844, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 200908101.0, + "reward": 1.5249433432306563, + "reward_std": 0.23494758918171837, + "rewards/acc_reward_func": 1.5249433120091755, + "step": 217 + }, + { + "clip_ratio": 0.00014925150822992216, + "epoch": 1.4277777777777778, + "grad_norm": 0.10302122682332993, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 218 + }, + { + "clip_ratio": 0.00030673204136768444, + "epoch": 1.4342592592592593, + "grad_norm": 0.10091983526945114, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 219 + }, + { + "clip_ratio": 0.000536466390518139, + "epoch": 1.4407407407407407, + "grad_norm": 0.09628088772296906, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.2879871186756, + "epoch": 1.4472222222222222, + "grad_norm": 0.08676353842020035, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 204461333.0, + "reward": 1.4863945983705067, + "reward_std": 0.18810446666819708, + "rewards/acc_reward_func": 1.4863945699873424, + "step": 221 + }, + { + "clip_ratio": 0.00011991041174042039, + "epoch": 1.4537037037037037, + "grad_norm": 0.08577782660722733, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 222 + }, + { + "clip_ratio": 0.0002372816095538881, + "epoch": 1.4601851851851853, + "grad_norm": 0.08848343044519424, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 223 + }, + { + "clip_ratio": 0.0003101691067318565, + "epoch": 1.4666666666666668, + "grad_norm": 0.0836254134774208, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.7074890136719, + "epoch": 1.473148148148148, + "grad_norm": 0.09105908870697021, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 208113905.0, + "reward": 1.5317460695902507, + "reward_std": 0.2008282034879639, + "rewards/acc_reward_func": 1.5317460298538208, + "step": 225 + }, + { + "clip_ratio": 0.0001545734183829544, + "epoch": 1.4796296296296296, + "grad_norm": 0.08936156332492828, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 226 + }, + { + "clip_ratio": 0.0002477309672464062, + "epoch": 1.4861111111111112, + "grad_norm": 0.08730500191450119, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 227 + }, + { + "clip_ratio": 0.00045893899715294885, + "epoch": 1.4925925925925925, + "grad_norm": 0.08526027202606201, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.7585071382068, + "epoch": 1.499074074074074, + "grad_norm": 0.09061747789382935, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 211741022.0, + "reward": 1.4829932167416526, + "reward_std": 0.20110960996576718, + "rewards/acc_reward_func": 1.4829931883584886, + "step": 229 + }, + { + "clip_ratio": 0.00015922586594353474, + "epoch": 1.5055555555555555, + "grad_norm": 0.09076972305774689, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 230 + }, + { + "clip_ratio": 0.0003245450009541985, + "epoch": 1.512037037037037, + "grad_norm": 0.09029541909694672, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 231 + }, + { + "clip_ratio": 0.0005226546251963425, + "epoch": 1.5185185185185186, + "grad_norm": 0.08851893246173859, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.81519862583707, + "epoch": 1.525, + "grad_norm": 0.08343932777643204, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 215696347.0, + "reward": 1.503401395820436, + "reward_std": 0.19729749645505631, + "rewards/acc_reward_func": 1.5034013560840063, + "step": 233 + }, + { + "clip_ratio": 0.0001319292518712159, + "epoch": 1.5314814814814814, + "grad_norm": 0.08363550901412964, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 234 + }, + { + "clip_ratio": 0.00015108115725784695, + "epoch": 1.537962962962963, + "grad_norm": 0.08246675133705139, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 235 + }, + { + "clip_ratio": 0.00021679888924977387, + "epoch": 1.5444444444444443, + "grad_norm": 0.08294124901294708, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.6542024158296, + "epoch": 1.550925925925926, + "grad_norm": 0.09547173976898193, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 219263570.0, + "reward": 1.5589569409688313, + "reward_std": 0.22031122871807643, + "rewards/acc_reward_func": 1.558956923938933, + "step": 237 + }, + { + "clip_ratio": 9.522035404751521e-05, + "epoch": 1.5574074074074074, + "grad_norm": 0.09474051743745804, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 238 + }, + { + "clip_ratio": 0.00018606578322803778, + "epoch": 1.5638888888888889, + "grad_norm": 0.09293164312839508, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 239 + }, + { + "clip_ratio": 0.000349092049873434, + "epoch": 1.5703703703703704, + "grad_norm": 0.08964758366346359, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.5249488467262, + "epoch": 1.5768518518518517, + "grad_norm": 0.07777679711580276, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 222901257.0, + "reward": 1.6167800653548468, + "reward_std": 0.15760817981901623, + "rewards/acc_reward_func": 1.6167800653548468, + "step": 241 + }, + { + "clip_ratio": 0.00013264746234026027, + "epoch": 1.5833333333333335, + "grad_norm": 0.07154504954814911, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 242 + }, + { + "clip_ratio": 0.00021582282629207752, + "epoch": 1.5898148148148148, + "grad_norm": 0.07036437839269638, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 243 + }, + { + "clip_ratio": 0.0004014643964183051, + "epoch": 1.5962962962962963, + "grad_norm": 0.06977999955415726, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.1610049293155, + "epoch": 1.6027777777777779, + "grad_norm": 0.19480708241462708, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 226504849.0, + "reward": 1.5680272494043623, + "reward_std": 0.17598431486459004, + "rewards/acc_reward_func": 1.5680272039912997, + "step": 245 + }, + { + "clip_ratio": 8.644776798540814e-05, + "epoch": 1.6092592592592592, + "grad_norm": 0.07636608183383942, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 246 + }, + { + "clip_ratio": 0.00016529553405624547, + "epoch": 1.6157407407407407, + "grad_norm": 0.0800333172082901, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 247 + }, + { + "clip_ratio": 0.0002638628620410427, + "epoch": 1.6222222222222222, + "grad_norm": 0.07518580555915833, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.0079432896205, + "epoch": 1.6287037037037035, + "grad_norm": 0.08973593264818192, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 230496284.0, + "reward": 1.6439909594399589, + "reward_std": 0.22272922364728792, + "rewards/acc_reward_func": 1.6439909367334276, + "step": 249 + }, + { + "clip_ratio": 9.212724364174174e-05, + "epoch": 1.6351851851851853, + "grad_norm": 0.09071903675794601, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 250 + }, + { + "clip_ratio": 0.00014695563593358245, + "epoch": 1.6416666666666666, + "grad_norm": 0.08961236476898193, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 251 + }, + { + "clip_ratio": 0.0002890962827049883, + "epoch": 1.6481481481481481, + "grad_norm": 0.08664807677268982, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.3832252139137, + "epoch": 1.6546296296296297, + "grad_norm": 0.0890711322426796, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 234104038.0, + "reward": 1.6360544533956618, + "reward_std": 0.20212856946246965, + "rewards/acc_reward_func": 1.6360544250124978, + "step": 253 + }, + { + "clip_ratio": 0.00010686751574255704, + "epoch": 1.661111111111111, + "grad_norm": 0.09000733494758606, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 254 + }, + { + "clip_ratio": 0.00012221902753004716, + "epoch": 1.6675925925925927, + "grad_norm": 0.08966827392578125, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 255 + }, + { + "clip_ratio": 0.0002542839824205397, + "epoch": 1.674074074074074, + "grad_norm": 0.08624038100242615, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.71202305385043, + "epoch": 1.6805555555555556, + "grad_norm": 0.08406448364257812, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 237931352.0, + "reward": 1.5668934555280776, + "reward_std": 0.18583334485689798, + "rewards/acc_reward_func": 1.5668934299832298, + "step": 257 + }, + { + "clip_ratio": 0.0001015219997844681, + "epoch": 1.6870370370370371, + "grad_norm": 0.08446004986763, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 258 + }, + { + "clip_ratio": 0.00016563806968319806, + "epoch": 1.6935185185185184, + "grad_norm": 0.08104430139064789, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 259 + }, + { + "clip_ratio": 0.0004377198553508303, + "epoch": 1.7, + "grad_norm": 0.08080463111400604, + "learning_rate": 1e-06, + "loss": -0.0051, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.3129316057478, + "epoch": 1.7064814814814815, + "grad_norm": 0.10368765890598297, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 241470596.0, + "reward": 1.4274376801082067, + "reward_std": 0.2483105512247199, + "rewards/acc_reward_func": 1.4274376460484095, + "step": 261 + }, + { + "clip_ratio": 0.00011860012004728473, + "epoch": 1.7129629629629628, + "grad_norm": 0.10247006267309189, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 262 + }, + { + "clip_ratio": 0.00021534529583905603, + "epoch": 1.7194444444444446, + "grad_norm": 0.09949088841676712, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 263 + }, + { + "clip_ratio": 0.0004019789394944729, + "epoch": 1.7259259259259259, + "grad_norm": 0.09739667177200317, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.7698480515253, + "epoch": 1.7324074074074074, + "grad_norm": 0.09280133247375488, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 245293317.0, + "reward": 1.6485261122385662, + "reward_std": 0.21551773555222012, + "rewards/acc_reward_func": 1.6485260725021362, + "step": 265 + }, + { + "clip_ratio": 0.0001486230517219242, + "epoch": 1.738888888888889, + "grad_norm": 0.0918872281908989, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 266 + }, + { + "clip_ratio": 0.00017756144734448753, + "epoch": 1.7453703703703702, + "grad_norm": 0.09080260992050171, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 267 + }, + { + "clip_ratio": 0.0002545833766427157, + "epoch": 1.751851851851852, + "grad_norm": 0.09026115387678146, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.11565580822173, + "epoch": 1.7583333333333333, + "grad_norm": 0.07813248783349991, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 249129855.0, + "reward": 1.5600907234918504, + "reward_std": 0.1942712063235896, + "rewards/acc_reward_func": 1.560090700785319, + "step": 269 + }, + { + "clip_ratio": 8.448378156615599e-05, + "epoch": 1.7648148148148148, + "grad_norm": 0.07819830626249313, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 270 + }, + { + "clip_ratio": 0.00012770562101477046, + "epoch": 1.7712962962962964, + "grad_norm": 0.07886083424091339, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 271 + }, + { + "clip_ratio": 0.00021004644683368193, + "epoch": 1.7777777777777777, + "grad_norm": 0.0773688331246376, + "learning_rate": 1e-06, + "loss": 0.0113, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.6462692987351, + "epoch": 1.7842592592592592, + "grad_norm": 0.09219877421855927, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 252783987.0, + "reward": 1.5691610290890647, + "reward_std": 0.21298281227548918, + "rewards/acc_reward_func": 1.5691610007059007, + "step": 273 + }, + { + "clip_ratio": 9.461846673816798e-05, + "epoch": 1.7907407407407407, + "grad_norm": 0.08525452762842178, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 274 + }, + { + "clip_ratio": 0.0001551211105487753, + "epoch": 1.7972222222222223, + "grad_norm": 0.08564640581607819, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 275 + }, + { + "clip_ratio": 0.0003054911636960848, + "epoch": 1.8037037037037038, + "grad_norm": 0.08445427566766739, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.36622256324404, + "epoch": 1.8101851851851851, + "grad_norm": 0.09835602343082428, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 256514324.0, + "reward": 1.5283446907997131, + "reward_std": 0.23371348583272525, + "rewards/acc_reward_func": 1.5283446680931818, + "step": 277 + }, + { + "clip_ratio": 0.00011316709839905213, + "epoch": 1.8166666666666667, + "grad_norm": 0.09218237549066544, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 278 + }, + { + "clip_ratio": 0.00012780956482553543, + "epoch": 1.8231481481481482, + "grad_norm": 0.08990070223808289, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 279 + }, + { + "clip_ratio": 0.0002694847620053527, + "epoch": 1.8296296296296295, + "grad_norm": 0.08729380369186401, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.97166224888394, + "epoch": 1.8361111111111112, + "grad_norm": 0.08512269705533981, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 260558815.0, + "reward": 1.6553288300832112, + "reward_std": 0.18018270248458498, + "rewards/acc_reward_func": 1.6553287903467815, + "step": 281 + }, + { + "clip_ratio": 0.00011174656295528014, + "epoch": 1.8425925925925926, + "grad_norm": 0.07830575108528137, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 282 + }, + { + "clip_ratio": 0.00014859736603241237, + "epoch": 1.849074074074074, + "grad_norm": 0.07735547423362732, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 283 + }, + { + "clip_ratio": 0.00023661474794304618, + "epoch": 1.8555555555555556, + "grad_norm": 0.07640089839696884, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.8832281203497, + "epoch": 1.862037037037037, + "grad_norm": 0.0777854397892952, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 264206382.0, + "reward": 1.6394558179946173, + "reward_std": 0.19011170026801882, + "rewards/acc_reward_func": 1.6394557669049217, + "step": 285 + }, + { + "clip_ratio": 8.368942369651493e-05, + "epoch": 1.8685185185185185, + "grad_norm": 0.07746341824531555, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 286 + }, + { + "clip_ratio": 0.0001167819041473281, + "epoch": 1.875, + "grad_norm": 0.0767863318324089, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 287 + }, + { + "clip_ratio": 0.00020507513424187587, + "epoch": 1.8814814814814815, + "grad_norm": 0.07616633176803589, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.0975123814174, + "epoch": 1.887962962962963, + "grad_norm": 0.07935984432697296, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 268028222.0, + "reward": 1.5997732764198667, + "reward_std": 0.18707973510026932, + "rewards/acc_reward_func": 1.5997732423600697, + "step": 289 + }, + { + "clip_ratio": 7.331416522252507e-05, + "epoch": 1.8944444444444444, + "grad_norm": 0.07752402871847153, + "learning_rate": 1e-06, + "loss": 0.0078, + "step": 290 + }, + { + "clip_ratio": 9.849115892956477e-05, + "epoch": 1.900925925925926, + "grad_norm": 0.076600082218647, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 291 + }, + { + "clip_ratio": 0.0001755388210377922, + "epoch": 1.9074074074074074, + "grad_norm": 0.07660045474767685, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.53515625, + "epoch": 1.9138888888888888, + "grad_norm": 0.0926726683974266, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 271639278.0, + "reward": 1.452380983602433, + "reward_std": 0.24590044894388743, + "rewards/acc_reward_func": 1.4523809580575853, + "step": 293 + }, + { + "clip_ratio": 8.711325686558016e-05, + "epoch": 1.9203703703703705, + "grad_norm": 0.09404096752405167, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 294 + }, + { + "clip_ratio": 0.00013792818329723863, + "epoch": 1.9268518518518518, + "grad_norm": 0.09250401705503464, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 295 + }, + { + "clip_ratio": 0.0002357466875678039, + "epoch": 1.9333333333333333, + "grad_norm": 0.09176366031169891, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.4886707124256, + "epoch": 1.9398148148148149, + "grad_norm": 0.08760599046945572, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 275315687.0, + "reward": 1.529478478999365, + "reward_std": 0.2415944811488901, + "rewards/acc_reward_func": 1.5294784619694664, + "step": 297 + }, + { + "clip_ratio": 0.00010062488824284325, + "epoch": 1.9462962962962962, + "grad_norm": 0.08795120567083359, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 298 + }, + { + "clip_ratio": 0.00017867312445083545, + "epoch": 1.9527777777777777, + "grad_norm": 0.08775324374437332, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 299 + }, + { + "clip_ratio": 0.000306412472300941, + "epoch": 1.9592592592592593, + "grad_norm": 0.08796869218349457, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.61678786504837, + "epoch": 1.9657407407407408, + "grad_norm": 0.08162126690149307, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 279066213.0, + "reward": 1.515873046148391, + "reward_std": 0.19076500141194888, + "rewards/acc_reward_func": 1.5158730177652269, + "step": 301 + }, + { + "clip_ratio": 7.91146989545918e-05, + "epoch": 1.9722222222222223, + "grad_norm": 0.0824337899684906, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 302 + }, + { + "clip_ratio": 0.00014237434654552046, + "epoch": 1.9787037037037036, + "grad_norm": 0.08004167675971985, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 303 + }, + { + "clip_ratio": 0.00029009427366656295, + "epoch": 1.9851851851851852, + "grad_norm": 0.08044147491455078, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.05669875372024, + "epoch": 2.0064814814814813, + "grad_norm": 0.07870854437351227, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 282465419.0, + "reward": 1.5374149935586112, + "reward_std": 0.1854196455152262, + "rewards/acc_reward_func": 1.53741497085208, + "step": 305 + }, + { + "clip_ratio": 8.949786959939437e-05, + "epoch": 2.012962962962963, + "grad_norm": 0.08076539635658264, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 306 + }, + { + "clip_ratio": 0.00015401908898465556, + "epoch": 2.0194444444444444, + "grad_norm": 0.0782662183046341, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 307 + }, + { + "clip_ratio": 0.00023743030829964916, + "epoch": 2.025925925925926, + "grad_norm": 0.07929681241512299, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.8979681105841, + "epoch": 2.0324074074074074, + "grad_norm": 0.07813294231891632, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 286079843.0, + "reward": 1.673469407217843, + "reward_std": 0.17615552140133722, + "rewards/acc_reward_func": 1.673469378834679, + "step": 309 + }, + { + "clip_ratio": 0.00010939102676708163, + "epoch": 2.0388888888888888, + "grad_norm": 0.07774676382541656, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 310 + }, + { + "clip_ratio": 0.00011380412355980038, + "epoch": 2.0453703703703705, + "grad_norm": 0.07755687832832336, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 311 + }, + { + "clip_ratio": 0.0002462299581522876, + "epoch": 2.051851851851852, + "grad_norm": 0.08249640464782715, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.54082670665923, + "epoch": 2.058333333333333, + "grad_norm": 0.07517673820257187, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 289924850.0, + "reward": 1.6281179274831499, + "reward_std": 0.17834148626951946, + "rewards/acc_reward_func": 1.6281179132915677, + "step": 313 + }, + { + "clip_ratio": 0.00011314422001651976, + "epoch": 2.064814814814815, + "grad_norm": 0.073227159678936, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 314 + }, + { + "clip_ratio": 0.00020379069714441096, + "epoch": 2.071296296296296, + "grad_norm": 0.07385105639696121, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 315 + }, + { + "clip_ratio": 0.0003329941499119048, + "epoch": 2.077777777777778, + "grad_norm": 0.07256048172712326, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.64512997581846, + "epoch": 2.0842592592592593, + "grad_norm": 0.08051841706037521, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 293558677.0, + "reward": 1.6541950475601923, + "reward_std": 0.17689220057356925, + "rewards/acc_reward_func": 1.6541950021471297, + "step": 317 + }, + { + "clip_ratio": 8.453805778463859e-05, + "epoch": 2.0907407407407406, + "grad_norm": 0.08467243611812592, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 318 + }, + { + "clip_ratio": 0.00011742045968449453, + "epoch": 2.0972222222222223, + "grad_norm": 0.07905680686235428, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 319 + }, + { + "clip_ratio": 0.0002576940849914016, + "epoch": 2.1037037037037036, + "grad_norm": 0.07652027159929276, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.39683605375745, + "epoch": 2.1101851851851854, + "grad_norm": 0.07079870998859406, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 297388737.0, + "reward": 1.6224490063531058, + "reward_std": 0.14390913556729043, + "rewards/acc_reward_func": 1.6224489779699416, + "step": 321 + }, + { + "clip_ratio": 6.244152800285346e-05, + "epoch": 2.1166666666666667, + "grad_norm": 0.07108927518129349, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 322 + }, + { + "clip_ratio": 9.540307673021397e-05, + "epoch": 2.123148148148148, + "grad_norm": 0.07107880711555481, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 323 + }, + { + "clip_ratio": 0.00015793243524274745, + "epoch": 2.1296296296296298, + "grad_norm": 0.07058020681142807, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.31520298549106, + "epoch": 2.136111111111111, + "grad_norm": 0.07716906815767288, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 301268873.0, + "reward": 1.6712018421718053, + "reward_std": 0.1573598569347745, + "rewards/acc_reward_func": 1.671201813788641, + "step": 325 + }, + { + "clip_ratio": 6.981899336789779e-05, + "epoch": 2.1425925925925924, + "grad_norm": 0.07711810618638992, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 326 + }, + { + "clip_ratio": 9.463478012808732e-05, + "epoch": 2.149074074074074, + "grad_norm": 0.07729792594909668, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 327 + }, + { + "clip_ratio": 0.0001468960305958587, + "epoch": 2.1555555555555554, + "grad_norm": 0.07669426500797272, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.0521647135417, + "epoch": 2.162037037037037, + "grad_norm": 0.08283355832099915, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 304759563.0, + "reward": 1.6179138592311315, + "reward_std": 0.18949996839676583, + "rewards/acc_reward_func": 1.6179138365246, + "step": 329 + }, + { + "clip_ratio": 0.0001103849049069963, + "epoch": 2.1685185185185185, + "grad_norm": 0.08356571942567825, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 330 + }, + { + "clip_ratio": 0.0001704507456105646, + "epoch": 2.175, + "grad_norm": 0.08093303442001343, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 331 + }, + { + "clip_ratio": 0.0002644155105198955, + "epoch": 2.1814814814814816, + "grad_norm": 0.07993580400943756, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.93311564127606, + "epoch": 2.187962962962963, + "grad_norm": 0.08723075687885284, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 308389564.0, + "reward": 1.6746032010941279, + "reward_std": 0.19914495199918747, + "rewards/acc_reward_func": 1.6746031670343309, + "step": 333 + }, + { + "clip_ratio": 0.0001357406850036655, + "epoch": 2.1944444444444446, + "grad_norm": 0.09279884397983551, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 334 + }, + { + "clip_ratio": 0.0002144234143391562, + "epoch": 2.200925925925926, + "grad_norm": 0.08094783127307892, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 335 + }, + { + "clip_ratio": 0.0003632333076287371, + "epoch": 2.2074074074074073, + "grad_norm": 0.08018580079078674, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.8310735793341, + "epoch": 2.213888888888889, + "grad_norm": 0.07070093601942062, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 312450389.0, + "reward": 1.6836735010147095, + "reward_std": 0.15429549912611643, + "rewards/acc_reward_func": 1.6836734669549125, + "step": 337 + }, + { + "clip_ratio": 5.7854162670472374e-05, + "epoch": 2.2203703703703703, + "grad_norm": 0.07059533894062042, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 338 + }, + { + "clip_ratio": 7.601507691322782e-05, + "epoch": 2.226851851851852, + "grad_norm": 0.07046937197446823, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 339 + }, + { + "clip_ratio": 0.00011438851513611596, + "epoch": 2.2333333333333334, + "grad_norm": 0.06987206637859344, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.7800525483631, + "epoch": 2.2398148148148147, + "grad_norm": 0.07333923131227493, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 316124745.0, + "reward": 1.651927459807623, + "reward_std": 0.15663272621376173, + "rewards/acc_reward_func": 1.6519274314244587, + "step": 341 + }, + { + "clip_ratio": 5.6131516820252205e-05, + "epoch": 2.2462962962962965, + "grad_norm": 0.07241741567850113, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 342 + }, + { + "clip_ratio": 7.750764762888485e-05, + "epoch": 2.2527777777777778, + "grad_norm": 0.07209280133247375, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 343 + }, + { + "clip_ratio": 0.0001592640822179549, + "epoch": 2.259259259259259, + "grad_norm": 0.07179038226604462, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.90136864071803, + "epoch": 2.265740740740741, + "grad_norm": 0.06366118788719177, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 319907790.0, + "reward": 1.6088435649871826, + "reward_std": 0.13132147632894062, + "rewards/acc_reward_func": 1.6088435309273856, + "step": 345 + }, + { + "clip_ratio": 5.585887724702756e-05, + "epoch": 2.272222222222222, + "grad_norm": 0.06443970650434494, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 346 + }, + { + "clip_ratio": 6.936551792369712e-05, + "epoch": 2.278703703703704, + "grad_norm": 0.06347978860139847, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 347 + }, + { + "clip_ratio": 8.864202668302737e-05, + "epoch": 2.285185185185185, + "grad_norm": 0.06255137920379639, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.8322099958147, + "epoch": 2.2916666666666665, + "grad_norm": 0.06696058064699173, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 323384510.0, + "reward": 1.6383220184416998, + "reward_std": 0.13071540867288908, + "rewards/acc_reward_func": 1.6383219900585355, + "step": 349 + }, + { + "clip_ratio": 4.920152094287221e-05, + "epoch": 2.2981481481481483, + "grad_norm": 0.06489527225494385, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 350 + }, + { + "clip_ratio": 6.74184571142264e-05, + "epoch": 2.3046296296296296, + "grad_norm": 0.06311435252428055, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 351 + }, + { + "clip_ratio": 0.0001653696353536188, + "epoch": 2.311111111111111, + "grad_norm": 0.06215111166238785, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.8628191266741, + "epoch": 2.3175925925925926, + "grad_norm": 0.08817350119352341, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 327426049.0, + "reward": 1.68934242498307, + "reward_std": 0.14556503544251123, + "rewards/acc_reward_func": 1.6893424022765386, + "step": 353 + }, + { + "clip_ratio": 4.580863883131228e-05, + "epoch": 2.324074074074074, + "grad_norm": 0.07056381553411484, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 354 + }, + { + "clip_ratio": 8.168562462309464e-05, + "epoch": 2.3305555555555557, + "grad_norm": 0.0691598653793335, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 355 + }, + { + "clip_ratio": 0.00018179262354221595, + "epoch": 2.337037037037037, + "grad_norm": 0.06830444186925888, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.96145775204616, + "epoch": 2.3435185185185183, + "grad_norm": 0.07399642467498779, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 331174365.0, + "reward": 1.6439909594399589, + "reward_std": 0.15822177433541842, + "rewards/acc_reward_func": 1.6439909310567946, + "step": 357 + }, + { + "clip_ratio": 4.970507004708495e-05, + "epoch": 2.35, + "grad_norm": 0.07388998568058014, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 358 + }, + { + "clip_ratio": 7.191943188358674e-05, + "epoch": 2.3564814814814814, + "grad_norm": 0.07321937382221222, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 359 + }, + { + "clip_ratio": 0.00010987346589293641, + "epoch": 2.362962962962963, + "grad_norm": 0.07204120606184006, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.12698654901413, + "epoch": 2.3694444444444445, + "grad_norm": 0.0716153234243393, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 334945303.0, + "reward": 1.568027240889413, + "reward_std": 0.14656859547609374, + "rewards/acc_reward_func": 1.568027206829616, + "step": 361 + }, + { + "clip_ratio": 5.729519952659584e-05, + "epoch": 2.3759259259259258, + "grad_norm": 0.07136084884405136, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 362 + }, + { + "clip_ratio": 5.302412908003178e-05, + "epoch": 2.3824074074074075, + "grad_norm": 0.07047837227582932, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 363 + }, + { + "clip_ratio": 0.00014433350086273138, + "epoch": 2.388888888888889, + "grad_norm": 0.06907393783330917, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.1598714192708, + "epoch": 2.3953703703703706, + "grad_norm": 0.08112609386444092, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 338547496.0, + "reward": 1.613378712109157, + "reward_std": 0.16461583830061413, + "rewards/acc_reward_func": 1.613378683725993, + "step": 365 + }, + { + "clip_ratio": 6.238901100697971e-05, + "epoch": 2.401851851851852, + "grad_norm": 0.08133766055107117, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 366 + }, + { + "clip_ratio": 0.00019287066121857302, + "epoch": 2.408333333333333, + "grad_norm": 0.07959448546171188, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 367 + }, + { + "clip_ratio": 0.0002893279022481736, + "epoch": 2.414814814814815, + "grad_norm": 0.07730524241924286, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.75284249441967, + "epoch": 2.4212962962962963, + "grad_norm": 0.08015070855617523, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 342065684.0, + "reward": 1.6065759829112463, + "reward_std": 0.1567436396366074, + "rewards/acc_reward_func": 1.6065759658813477, + "step": 369 + }, + { + "clip_ratio": 4.735970131670391e-05, + "epoch": 2.4277777777777776, + "grad_norm": 0.08372899144887924, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 370 + }, + { + "clip_ratio": 0.00012169528140691996, + "epoch": 2.4342592592592593, + "grad_norm": 0.0796024277806282, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 371 + }, + { + "clip_ratio": 0.0003254882942114602, + "epoch": 2.4407407407407407, + "grad_norm": 0.07758983969688416, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.54762776692706, + "epoch": 2.4472222222222224, + "grad_norm": 0.0863879844546318, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 345663755.0, + "reward": 1.697278931027367, + "reward_std": 0.1657904459252244, + "rewards/acc_reward_func": 1.6972789196741014, + "step": 373 + }, + { + "clip_ratio": 7.448090400430374e-05, + "epoch": 2.4537037037037037, + "grad_norm": 0.08563799411058426, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 374 + }, + { + "clip_ratio": 0.00020172142172842066, + "epoch": 2.460185185185185, + "grad_norm": 0.0837676078081131, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 375 + }, + { + "clip_ratio": 0.0004377457608782043, + "epoch": 2.466666666666667, + "grad_norm": 0.08231651037931442, + "learning_rate": 1e-06, + "loss": -0.006, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.5204133533296, + "epoch": 2.473148148148148, + "grad_norm": 0.08515173196792603, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 349245410.0, + "reward": 1.7052154427482968, + "reward_std": 0.16742009811458133, + "rewards/acc_reward_func": 1.7052154200417655, + "step": 377 + }, + { + "clip_ratio": 6.578304687753276e-05, + "epoch": 2.4796296296296294, + "grad_norm": 0.08399412035942078, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 378 + }, + { + "clip_ratio": 0.00018409334491783133, + "epoch": 2.486111111111111, + "grad_norm": 0.08153863251209259, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 379 + }, + { + "clip_ratio": 0.0003922197685737739, + "epoch": 2.4925925925925925, + "grad_norm": 0.080192930996418, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.0147487095424, + "epoch": 2.4990740740740742, + "grad_norm": 0.07259111106395721, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 352687257.0, + "reward": 1.6315193062736875, + "reward_std": 0.13436711685998098, + "rewards/acc_reward_func": 1.6315192778905232, + "step": 381 + }, + { + "clip_ratio": 6.481163377646313e-05, + "epoch": 2.5055555555555555, + "grad_norm": 0.0701545923948288, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 382 + }, + { + "clip_ratio": 0.00020013943399784954, + "epoch": 2.512037037037037, + "grad_norm": 0.06911647319793701, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 383 + }, + { + "clip_ratio": 0.00035089113017810244, + "epoch": 2.5185185185185186, + "grad_norm": 0.06758937239646912, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.76077706473217, + "epoch": 2.525, + "grad_norm": 0.08028464764356613, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 356398820.0, + "reward": 1.6156462885084606, + "reward_std": 0.15897739288352786, + "rewards/acc_reward_func": 1.6156462658019293, + "step": 385 + }, + { + "clip_ratio": 6.950612147366406e-05, + "epoch": 2.5314814814814817, + "grad_norm": 0.0779779925942421, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 386 + }, + { + "clip_ratio": 0.00015359692943526344, + "epoch": 2.537962962962963, + "grad_norm": 0.07718382030725479, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 387 + }, + { + "clip_ratio": 0.00016256648242623278, + "epoch": 2.5444444444444443, + "grad_norm": 0.07550051063299179, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.92857869466144, + "epoch": 2.550925925925926, + "grad_norm": 0.06849638372659683, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 360012227.0, + "reward": 1.6315192977587383, + "reward_std": 0.11362639105036146, + "rewards/acc_reward_func": 1.631519269375574, + "step": 389 + }, + { + "clip_ratio": 5.923737660937366e-05, + "epoch": 2.5574074074074074, + "grad_norm": 0.06760665029287338, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 390 + }, + { + "clip_ratio": 0.0001121529177388376, + "epoch": 2.563888888888889, + "grad_norm": 0.06625531613826752, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 391 + }, + { + "clip_ratio": 0.00021175693923613012, + "epoch": 2.5703703703703704, + "grad_norm": 0.06420101970434189, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.4580586751302, + "epoch": 2.5768518518518517, + "grad_norm": 0.08265353739261627, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 363854623.0, + "reward": 1.6439909594399589, + "reward_std": 0.15082332562832607, + "rewards/acc_reward_func": 1.6439909310567946, + "step": 393 + }, + { + "clip_ratio": 3.310179055829178e-05, + "epoch": 2.5833333333333335, + "grad_norm": 0.0822535827755928, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 394 + }, + { + "clip_ratio": 0.00014647337836019383, + "epoch": 2.589814814814815, + "grad_norm": 0.07974167913198471, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 395 + }, + { + "clip_ratio": 0.0003786702473007608, + "epoch": 2.5962962962962965, + "grad_norm": 0.076680988073349, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.6383274623326, + "epoch": 2.602777777777778, + "grad_norm": 0.07131548970937729, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 367731450.0, + "reward": 1.6530612593605405, + "reward_std": 0.131582503340074, + "rewards/acc_reward_func": 1.6530612139474778, + "step": 397 + }, + { + "clip_ratio": 2.882930996184725e-05, + "epoch": 2.609259259259259, + "grad_norm": 0.0698467567563057, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 398 + }, + { + "clip_ratio": 0.00012053488760665503, + "epoch": 2.6157407407407405, + "grad_norm": 0.06839628517627716, + "learning_rate": 1e-06, + "loss": -0.0072, + "step": 399 + }, + { + "clip_ratio": 0.00024320762410449484, + "epoch": 2.6222222222222222, + "grad_norm": 0.06797367334365845, + "learning_rate": 1e-06, + "loss": -0.0076, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.0634998139881, + "epoch": 2.6287037037037035, + "grad_norm": 0.11827152222394943, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 371186810.0, + "reward": 1.606575988587879, + "reward_std": 0.14957289291279657, + "rewards/acc_reward_func": 1.6065759658813477, + "step": 401 + }, + { + "clip_ratio": 6.91131310651101e-05, + "epoch": 2.6351851851851853, + "grad_norm": 0.07700794190168381, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 402 + }, + { + "clip_ratio": 0.00016931103008183918, + "epoch": 2.6416666666666666, + "grad_norm": 0.07600509375333786, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 403 + }, + { + "clip_ratio": 0.00031958719098059064, + "epoch": 2.648148148148148, + "grad_norm": 0.0738753154873848, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.74490501767116, + "epoch": 2.6546296296296297, + "grad_norm": 0.06802462786436081, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 374782817.0, + "reward": 1.6836734896614438, + "reward_std": 0.12749963058602243, + "rewards/acc_reward_func": 1.6836734783081782, + "step": 405 + }, + { + "clip_ratio": 4.254553156594435e-05, + "epoch": 2.661111111111111, + "grad_norm": 0.06773427873849869, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 406 + }, + { + "clip_ratio": 0.00012479171484647806, + "epoch": 2.6675925925925927, + "grad_norm": 0.06698736548423767, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 407 + }, + { + "clip_ratio": 0.00020478161154425747, + "epoch": 2.674074074074074, + "grad_norm": 0.06650257110595703, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.48753429594495, + "epoch": 2.6805555555555554, + "grad_norm": 0.08356206119060516, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 378096657.0, + "reward": 1.643990953763326, + "reward_std": 0.1610797480458305, + "rewards/acc_reward_func": 1.6439909310567946, + "step": 409 + }, + { + "clip_ratio": 3.973056993258762e-05, + "epoch": 2.687037037037037, + "grad_norm": 0.0816047191619873, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 410 + }, + { + "clip_ratio": 6.90307292859957e-05, + "epoch": 2.6935185185185184, + "grad_norm": 0.07993968576192856, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 411 + }, + { + "clip_ratio": 0.00017553037434395047, + "epoch": 2.7, + "grad_norm": 0.07977181673049927, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.8163350423177, + "epoch": 2.7064814814814815, + "grad_norm": 0.07647784799337387, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 381935865.0, + "reward": 1.674603195417495, + "reward_std": 0.16518590492861612, + "rewards/acc_reward_func": 1.674603161357698, + "step": 413 + }, + { + "clip_ratio": 4.758690729864784e-05, + "epoch": 2.712962962962963, + "grad_norm": 0.07679473608732224, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 414 + }, + { + "clip_ratio": 8.775196450490814e-05, + "epoch": 2.7194444444444446, + "grad_norm": 0.07613964378833771, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 415 + }, + { + "clip_ratio": 0.0002375333022514713, + "epoch": 2.725925925925926, + "grad_norm": 0.07401680946350098, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.7766549246652, + "epoch": 2.7324074074074076, + "grad_norm": 0.07717788219451904, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 385755250.0, + "reward": 1.769841296332223, + "reward_std": 0.13284824416041374, + "rewards/acc_reward_func": 1.769841267949059, + "step": 417 + }, + { + "clip_ratio": 4.6571577730078606e-05, + "epoch": 2.738888888888889, + "grad_norm": 0.07409494370222092, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 418 + }, + { + "clip_ratio": 0.00010308226739566418, + "epoch": 2.7453703703703702, + "grad_norm": 0.07239258289337158, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 419 + }, + { + "clip_ratio": 0.00022064264131976024, + "epoch": 2.751851851851852, + "grad_norm": 0.07175586372613907, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.83787318638394, + "epoch": 2.7583333333333333, + "grad_norm": 0.0722418949007988, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 389321531.0, + "reward": 1.5861678350539434, + "reward_std": 0.1339329518377781, + "rewards/acc_reward_func": 1.5861677896408808, + "step": 421 + }, + { + "clip_ratio": 3.0351422104840387e-05, + "epoch": 2.764814814814815, + "grad_norm": 0.07102184742689133, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 422 + }, + { + "clip_ratio": 6.684829956308629e-05, + "epoch": 2.7712962962962964, + "grad_norm": 0.06921263784170151, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 423 + }, + { + "clip_ratio": 0.0001603819608655093, + "epoch": 2.7777777777777777, + "grad_norm": 0.06847840547561646, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.74603852771577, + "epoch": 2.784259259259259, + "grad_norm": 0.07620932906866074, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 392811117.0, + "reward": 1.675736977940514, + "reward_std": 0.15024714987902416, + "rewards/acc_reward_func": 1.6757369552339827, + "step": 425 + }, + { + "clip_ratio": 5.349587324114206e-05, + "epoch": 2.7907407407407407, + "grad_norm": 0.07477039843797684, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 426 + }, + { + "clip_ratio": 0.0001953024965630556, + "epoch": 2.7972222222222225, + "grad_norm": 0.07259545475244522, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 427 + }, + { + "clip_ratio": 0.0004224494755146138, + "epoch": 2.803703703703704, + "grad_norm": 0.0708785280585289, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.82993861607144, + "epoch": 2.810185185185185, + "grad_norm": 0.07910261303186417, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 396527025.0, + "reward": 1.7210884661901564, + "reward_std": 0.16589947470596858, + "rewards/acc_reward_func": 1.7210884264537267, + "step": 429 + }, + { + "clip_ratio": 7.650961353127579e-05, + "epoch": 2.8166666666666664, + "grad_norm": 0.07643198221921921, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 430 + }, + { + "clip_ratio": 0.00023775612498866394, + "epoch": 2.823148148148148, + "grad_norm": 0.07540789246559143, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 431 + }, + { + "clip_ratio": 0.0005787453077833302, + "epoch": 2.8296296296296295, + "grad_norm": 0.07343152165412903, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.15986996605284, + "epoch": 2.8361111111111112, + "grad_norm": 0.08127926290035248, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 400366410.0, + "reward": 1.6848072721844627, + "reward_std": 0.11902960372113046, + "rewards/acc_reward_func": 1.684807260831197, + "step": 433 + }, + { + "clip_ratio": 5.624502747585731e-05, + "epoch": 2.8425925925925926, + "grad_norm": 0.08154301345348358, + "learning_rate": 1e-06, + "loss": 0.0052, + "step": 434 + }, + { + "clip_ratio": 0.00014876067438135144, + "epoch": 2.849074074074074, + "grad_norm": 0.07982967048883438, + "learning_rate": 1e-06, + "loss": 0.0048, + "step": 435 + }, + { + "clip_ratio": 0.00047466065838567114, + "epoch": 2.8555555555555556, + "grad_norm": 0.0750332623720169, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.58050246465774, + "epoch": 2.862037037037037, + "grad_norm": 0.06907039880752563, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 404099120.0, + "reward": 1.6791383482161022, + "reward_std": 0.09301007627731278, + "rewards/acc_reward_func": 1.6791383255095709, + "step": 437 + }, + { + "clip_ratio": 5.430434404323543e-05, + "epoch": 2.8685185185185187, + "grad_norm": 0.06709130853414536, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 438 + }, + { + "clip_ratio": 0.00010574558421337445, + "epoch": 2.875, + "grad_norm": 0.06637100130319595, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 439 + }, + { + "clip_ratio": 0.0004048011510998809, + "epoch": 2.8814814814814813, + "grad_norm": 0.06594450771808624, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1303899855841, + "epoch": 2.887962962962963, + "grad_norm": 0.09415791928768158, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 407581593.0, + "reward": 1.5895691911379497, + "reward_std": 0.15091915730209576, + "rewards/acc_reward_func": 1.5895691627547854, + "step": 441 + }, + { + "clip_ratio": 7.479514776302192e-05, + "epoch": 2.8944444444444444, + "grad_norm": 0.09067609906196594, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 442 + }, + { + "clip_ratio": 0.00017182660078452456, + "epoch": 2.900925925925926, + "grad_norm": 0.08883418887853622, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 443 + }, + { + "clip_ratio": 0.00044846041141898327, + "epoch": 2.9074074074074074, + "grad_norm": 0.08536746352910995, + "learning_rate": 1e-06, + "loss": -0.0052, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.00907825288317, + "epoch": 2.9138888888888888, + "grad_norm": 0.08820128440856934, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 411218279.0, + "reward": 1.749433125768389, + "reward_std": 0.12802751929987044, + "rewards/acc_reward_func": 1.7494331030618577, + "step": 445 + }, + { + "clip_ratio": 8.860694513367933e-05, + "epoch": 2.9203703703703705, + "grad_norm": 0.08626239001750946, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 446 + }, + { + "clip_ratio": 0.0003607673178997911, + "epoch": 2.926851851851852, + "grad_norm": 0.08441189676523209, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 447 + }, + { + "clip_ratio": 0.0008602460790522551, + "epoch": 2.9333333333333336, + "grad_norm": 0.08371831476688385, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5929768880208, + "epoch": 2.939814814814815, + "grad_norm": 0.08556215465068817, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 415063728.0, + "reward": 1.6507936772846041, + "reward_std": 0.12087976418080784, + "rewards/acc_reward_func": 1.6507936602547055, + "step": 449 + }, + { + "clip_ratio": 6.594504590057546e-05, + "epoch": 2.946296296296296, + "grad_norm": 0.08255585283041, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 450 + }, + { + "clip_ratio": 0.00024212310728173527, + "epoch": 2.9527777777777775, + "grad_norm": 0.0834159404039383, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 451 + }, + { + "clip_ratio": 0.0005293784523105604, + "epoch": 2.9592592592592593, + "grad_norm": 0.07938043773174286, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.26644606817337, + "epoch": 2.965740740740741, + "grad_norm": 0.0936601534485817, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 418688725.0, + "reward": 1.658730183328901, + "reward_std": 0.14163324290088244, + "rewards/acc_reward_func": 1.6587301662990026, + "step": 453 + }, + { + "clip_ratio": 7.96998169140092e-05, + "epoch": 2.9722222222222223, + "grad_norm": 0.08971893042325974, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 454 + }, + { + "clip_ratio": 0.00018841165833042135, + "epoch": 2.9787037037037036, + "grad_norm": 0.08801492303609848, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 455 + }, + { + "clip_ratio": 0.00046365962216874496, + "epoch": 2.985185185185185, + "grad_norm": 0.0843043327331543, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.1598714192708, + "epoch": 3.0064814814814813, + "grad_norm": 0.12096734344959259, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 422140912.0, + "reward": 1.7018140838259743, + "reward_std": 0.16363864338823728, + "rewards/acc_reward_func": 1.7018140667960757, + "step": 457 + }, + { + "clip_ratio": 7.911362107344238e-05, + "epoch": 3.012962962962963, + "grad_norm": 0.1199815645813942, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 458 + }, + { + "clip_ratio": 0.0003393752437356549, + "epoch": 3.0194444444444444, + "grad_norm": 0.11338788270950317, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 459 + }, + { + "clip_ratio": 0.0012846073105243878, + "epoch": 3.025925925925926, + "grad_norm": 0.11216680705547333, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.7562415713356, + "epoch": 3.0324074074074074, + "grad_norm": 0.14053334295749664, + "learning_rate": 1e-06, + "loss": 0.0426, + "num_tokens": 425862443.0, + "reward": 1.6666666950498308, + "reward_std": 0.2066345683165959, + "rewards/acc_reward_func": 1.6666666723432995, + "step": 461 + }, + { + "clip_ratio": 0.0001409024182413261, + "epoch": 3.0388888888888888, + "grad_norm": 0.13825556635856628, + "learning_rate": 1e-06, + "loss": 0.0418, + "step": 462 + }, + { + "clip_ratio": 0.00045791927654395944, + "epoch": 3.0453703703703705, + "grad_norm": 0.1277666687965393, + "learning_rate": 1e-06, + "loss": 0.0407, + "step": 463 + }, + { + "clip_ratio": 0.0016882882032188632, + "epoch": 3.051851851851852, + "grad_norm": 0.11887232214212418, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.12245686848956, + "epoch": 3.058333333333333, + "grad_norm": 0.1422978937625885, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 429768887.0, + "reward": 1.7029478720256261, + "reward_std": 0.16384412295051984, + "rewards/acc_reward_func": 1.7029478549957275, + "step": 465 + }, + { + "clip_ratio": 0.00022631913868411045, + "epoch": 3.064814814814815, + "grad_norm": 0.12926463782787323, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 466 + }, + { + "clip_ratio": 0.0016295394466613374, + "epoch": 3.071296296296296, + "grad_norm": 0.12168161571025848, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 467 + }, + { + "clip_ratio": 0.004102049317831795, + "epoch": 3.077777777777778, + "grad_norm": 0.13436855375766754, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.0952410016741, + "epoch": 3.0842592592592593, + "grad_norm": 0.10855058580636978, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 433203107.0, + "reward": 1.6882086594899495, + "reward_std": 0.13231020988453002, + "rewards/acc_reward_func": 1.688208608400254, + "step": 469 + }, + { + "clip_ratio": 0.00115988185557182, + "epoch": 3.0907407407407406, + "grad_norm": 0.11043041199445724, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 470 + }, + { + "clip_ratio": 0.002692721124428014, + "epoch": 3.0972222222222223, + "grad_norm": 0.12333739548921585, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 471 + }, + { + "clip_ratio": 0.0018077372972454344, + "epoch": 3.1037037037037036, + "grad_norm": 0.0895155668258667, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.54309227353053, + "epoch": 3.1101851851851854, + "grad_norm": 0.1423775851726532, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 436395760.0, + "reward": 1.6870748485837663, + "reward_std": 0.14476618241696132, + "rewards/acc_reward_func": 1.687074825877235, + "step": 473 + }, + { + "clip_ratio": 0.0009848821119660335, + "epoch": 3.1166666666666667, + "grad_norm": 0.11566051095724106, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 474 + }, + { + "clip_ratio": 0.004755329806357622, + "epoch": 3.123148148148148, + "grad_norm": 0.13648808002471924, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 475 + }, + { + "clip_ratio": 0.0044265871623619685, + "epoch": 3.1296296296296298, + "grad_norm": 0.146810844540596, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.2312970842634, + "epoch": 3.136111111111111, + "grad_norm": 0.12026971578598022, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 439744150.0, + "reward": 1.633786882672991, + "reward_std": 0.11844597526249431, + "rewards/acc_reward_func": 1.6337868429365612, + "step": 477 + }, + { + "clip_ratio": 0.0009484389579267285, + "epoch": 3.1425925925925924, + "grad_norm": 0.1215616911649704, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 478 + }, + { + "clip_ratio": 0.003697521280541661, + "epoch": 3.149074074074074, + "grad_norm": 0.14095111191272736, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 479 + }, + { + "clip_ratio": 0.0034387974633968304, + "epoch": 3.1555555555555554, + "grad_norm": 0.12663334608078003, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.1598692394438, + "epoch": 3.162037037037037, + "grad_norm": 0.1346891224384308, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 443271385.0, + "reward": 1.7256236189887637, + "reward_std": 0.14089716740307354, + "rewards/acc_reward_func": 1.7256235792523338, + "step": 481 + }, + { + "clip_ratio": 0.0004448807906425957, + "epoch": 3.1685185185185185, + "grad_norm": 0.12079239636659622, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 482 + }, + { + "clip_ratio": 0.0020043016965722756, + "epoch": 3.175, + "grad_norm": 0.14017271995544434, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 483 + }, + { + "clip_ratio": 0.0017200966179925239, + "epoch": 3.1814814814814816, + "grad_norm": 0.15152569115161896, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.5975094749814, + "epoch": 3.187962962962963, + "grad_norm": 0.10771705955266953, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 446677080.0, + "reward": 1.7256236189887637, + "reward_std": 0.12167022980394818, + "rewards/acc_reward_func": 1.7256235792523338, + "step": 485 + }, + { + "clip_ratio": 0.0006665461551165208, + "epoch": 3.1944444444444446, + "grad_norm": 0.10599280893802643, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 486 + }, + { + "clip_ratio": 0.0033372725759233746, + "epoch": 3.200925925925926, + "grad_norm": 0.13055044412612915, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 487 + }, + { + "clip_ratio": 0.0035411600755261524, + "epoch": 3.2074074074074073, + "grad_norm": 0.12772558629512787, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.3888920375279, + "epoch": 3.213888888888889, + "grad_norm": 0.16783253848552704, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 450388591.0, + "reward": 1.6995465131033034, + "reward_std": 0.13824334208454406, + "rewards/acc_reward_func": 1.6995464733668737, + "step": 489 + }, + { + "clip_ratio": 0.0005684612243342036, + "epoch": 3.2203703703703703, + "grad_norm": 0.14137648046016693, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 490 + }, + { + "clip_ratio": 0.004452694151994018, + "epoch": 3.226851851851852, + "grad_norm": 0.14627555012702942, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 491 + }, + { + "clip_ratio": 0.006235157244927471, + "epoch": 3.2333333333333334, + "grad_norm": 0.17356812953948975, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.3911626906622, + "epoch": 3.2398148148148147, + "grad_norm": 0.1436953991651535, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 454082620.0, + "reward": 1.640589606194269, + "reward_std": 0.13000182736487614, + "rewards/acc_reward_func": 1.6405895664578392, + "step": 493 + }, + { + "clip_ratio": 0.00045952137346224237, + "epoch": 3.2462962962962965, + "grad_norm": 0.14312081038951874, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 494 + }, + { + "clip_ratio": 0.0025044624372163697, + "epoch": 3.2527777777777778, + "grad_norm": 0.15083995461463928, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 495 + }, + { + "clip_ratio": 0.002988206178304695, + "epoch": 3.259259259259259, + "grad_norm": 0.15178053081035614, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.91270228794642, + "epoch": 3.265740740740741, + "grad_norm": 0.11761778593063354, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 457521509.0, + "reward": 1.7120181776228405, + "reward_std": 0.10198826484736942, + "rewards/acc_reward_func": 1.7120181322097778, + "step": 497 + }, + { + "clip_ratio": 0.00034070668923613125, + "epoch": 3.272222222222222, + "grad_norm": 0.10410414636135101, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 498 + }, + { + "clip_ratio": 0.0006974699981150306, + "epoch": 3.278703703703704, + "grad_norm": 0.10498173534870148, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 499 + }, + { + "clip_ratio": 0.0008304606145386407, + "epoch": 3.285185185185185, + "grad_norm": 0.10428803414106369, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 770, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}