{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.104816696762272,
  "global_step": 193000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21,
      "learning_rate": 0.0002955626928115623,
      "loss": 1.1705,
      "step": 5000
    },
    {
      "epoch": 0.21,
      "eval_loss": 0.22145646810531616,
      "eval_runtime": 5082.7114,
      "eval_samples_per_second": 4.02,
      "step": 5000
    },
    {
      "epoch": 0.42,
      "learning_rate": 0.0002892236825423657,
      "loss": 0.2914,
      "step": 10000
    },
    {
      "epoch": 0.42,
      "eval_loss": 0.19062571227550507,
      "eval_runtime": 5086.3406,
      "eval_samples_per_second": 4.018,
      "step": 10000
    },
    {
      "epoch": 0.63,
      "learning_rate": 0.00028288467227316906,
      "loss": 0.2617,
      "step": 15000
    },
    {
      "epoch": 0.63,
      "eval_loss": 0.1784171611070633,
      "eval_runtime": 4895.8348,
      "eval_samples_per_second": 4.174,
      "step": 15000
    },
    {
      "epoch": 0.84,
      "learning_rate": 0.0002765456620039724,
      "loss": 0.2449,
      "step": 20000
    },
    {
      "epoch": 0.84,
      "eval_loss": 0.16641439497470856,
      "eval_runtime": 4945.3495,
      "eval_samples_per_second": 4.132,
      "step": 20000
    },
    {
      "epoch": 1.05,
      "learning_rate": 0.0002702066517347758,
      "loss": 0.2264,
      "step": 25000
    },
    {
      "epoch": 1.05,
      "eval_loss": 0.15650227665901184,
      "eval_runtime": 4999.732,
      "eval_samples_per_second": 4.087,
      "step": 25000
    },
    {
      "epoch": 1.26,
      "learning_rate": 0.00026386764146557915,
      "loss": 0.2093,
      "step": 30000
    },
    {
      "epoch": 1.26,
      "eval_loss": 0.15184776484966278,
      "eval_runtime": 4877.0685,
      "eval_samples_per_second": 4.19,
      "step": 30000
    },
    {
      "epoch": 1.47,
      "learning_rate": 0.0002575286311963825,
      "loss": 0.2003,
      "step": 35000
    },
    {
      "epoch": 1.47,
      "eval_loss": 0.14190027117729187,
      "eval_runtime": 4879.5538,
      "eval_samples_per_second": 4.188,
      "step": 35000
    },
    {
      "epoch": 1.68,
      "learning_rate": 0.0002511896209271859,
      "loss": 0.1962,
      "step": 40000
    },
    {
      "epoch": 1.68,
      "eval_loss": 0.14596430957317352,
      "eval_runtime": 4860.8654,
      "eval_samples_per_second": 4.204,
      "step": 40000
    },
    {
      "epoch": 1.89,
      "learning_rate": 0.00024485061065798925,
      "loss": 0.19,
      "step": 45000
    },
    {
      "epoch": 1.89,
      "eval_loss": 0.13479308784008026,
      "eval_runtime": 4872.4496,
      "eval_samples_per_second": 4.194,
      "step": 45000
    },
    {
      "epoch": 2.1,
      "learning_rate": 0.00023851160038879262,
      "loss": 0.1769,
      "step": 50000
    },
    {
      "epoch": 2.1,
      "eval_loss": 0.13018357753753662,
      "eval_runtime": 4872.3296,
      "eval_samples_per_second": 4.194,
      "step": 50000
    },
    {
      "epoch": 2.31,
      "learning_rate": 0.00023217259011959596,
      "loss": 0.1674,
      "step": 55000
    },
    {
      "epoch": 2.31,
      "eval_loss": 0.1303720772266388,
      "eval_runtime": 4874.4649,
      "eval_samples_per_second": 4.192,
      "step": 55000
    },
    {
      "epoch": 2.52,
      "learning_rate": 0.00022583357985039935,
      "loss": 0.1655,
      "step": 60000
    },
    {
      "epoch": 2.52,
      "eval_loss": 0.12317115068435669,
      "eval_runtime": 4882.2049,
      "eval_samples_per_second": 4.186,
      "step": 60000
    },
    {
      "epoch": 2.73,
      "learning_rate": 0.00021949456958120271,
      "loss": 0.1608,
      "step": 65000
    },
    {
      "epoch": 2.73,
      "eval_loss": 0.12056649476289749,
      "eval_runtime": 4877.3831,
      "eval_samples_per_second": 4.19,
      "step": 65000
    },
    {
      "epoch": 2.94,
      "learning_rate": 0.00021315555931200605,
      "loss": 0.1565,
      "step": 70000
    },
    {
      "epoch": 2.94,
      "eval_loss": 0.11486475169658661,
      "eval_runtime": 4876.7317,
      "eval_samples_per_second": 4.19,
      "step": 70000
    },
    {
      "epoch": 3.15,
      "learning_rate": 0.00020681654904280945,
      "loss": 0.146,
      "step": 75000
    },
    {
      "epoch": 3.15,
      "eval_loss": 0.11829441785812378,
      "eval_runtime": 4833.5153,
      "eval_samples_per_second": 4.228,
      "step": 75000
    },
    {
      "epoch": 3.36,
      "learning_rate": 0.00020047753877361279,
      "loss": 0.1403,
      "step": 80000
    },
    {
      "epoch": 3.36,
      "eval_loss": 0.11143175512552261,
      "eval_runtime": 4853.9814,
      "eval_samples_per_second": 4.21,
      "step": 80000
    },
    {
      "epoch": 3.57,
      "learning_rate": 0.00019413852850441618,
      "loss": 0.1376,
      "step": 85000
    },
    {
      "epoch": 3.57,
      "eval_loss": 0.11027190089225769,
      "eval_runtime": 4858.6043,
      "eval_samples_per_second": 4.206,
      "step": 85000
    },
    {
      "epoch": 3.78,
      "learning_rate": 0.00018779951823521952,
      "loss": 0.1337,
      "step": 90000
    },
    {
      "epoch": 3.78,
      "eval_loss": 0.10872453451156616,
      "eval_runtime": 4864.6388,
      "eval_samples_per_second": 4.201,
      "step": 90000
    },
    {
      "epoch": 3.99,
      "learning_rate": 0.00018146050796602288,
      "loss": 0.1325,
      "step": 95000
    },
    {
      "epoch": 3.99,
      "eval_loss": 0.10718829929828644,
      "eval_runtime": 4921.9134,
      "eval_samples_per_second": 4.152,
      "step": 95000
    },
    {
      "epoch": 4.2,
      "learning_rate": 0.00017512149769682625,
      "loss": 0.1195,
      "step": 100000
    },
    {
      "epoch": 4.2,
      "eval_loss": 0.10430513322353363,
      "eval_runtime": 4864.6263,
      "eval_samples_per_second": 4.201,
      "step": 100000
    },
    {
      "epoch": 4.41,
      "learning_rate": 0.00016878248742762961,
      "loss": 0.118,
      "step": 105000
    },
    {
      "epoch": 4.41,
      "eval_loss": 0.1070966124534607,
      "eval_runtime": 4879.2783,
      "eval_samples_per_second": 4.188,
      "step": 105000
    },
    {
      "epoch": 4.62,
      "learning_rate": 0.00016244347715843295,
      "loss": 0.1173,
      "step": 110000
    },
    {
      "epoch": 4.62,
      "eval_loss": 0.10433077067136765,
      "eval_runtime": 4876.4984,
      "eval_samples_per_second": 4.191,
      "step": 110000
    },
    {
      "epoch": 4.83,
      "learning_rate": 0.00015610446688923635,
      "loss": 0.115,
      "step": 115000
    },
    {
      "epoch": 4.83,
      "eval_loss": 0.09682977199554443,
      "eval_runtime": 4893.4652,
      "eval_samples_per_second": 4.176,
      "step": 115000
    },
    {
      "epoch": 5.04,
      "learning_rate": 0.0001497654566200397,
      "loss": 0.1102,
      "step": 120000
    },
    {
      "epoch": 5.04,
      "eval_loss": 0.09630288183689117,
      "eval_runtime": 4914.2049,
      "eval_samples_per_second": 4.158,
      "step": 120000
    },
    {
      "epoch": 5.25,
      "learning_rate": 0.00014342644635084308,
      "loss": 0.1019,
      "step": 125000
    },
    {
      "epoch": 5.25,
      "eval_loss": 0.0918075293302536,
      "eval_runtime": 4893.9499,
      "eval_samples_per_second": 4.176,
      "step": 125000
    },
    {
      "epoch": 5.46,
      "learning_rate": 0.00013708743608164644,
      "loss": 0.1014,
      "step": 130000
    },
    {
      "epoch": 5.46,
      "eval_loss": 0.09067174792289734,
      "eval_runtime": 4891.4795,
      "eval_samples_per_second": 4.178,
      "step": 130000
    },
    {
      "epoch": 5.67,
      "learning_rate": 0.0001307484258124498,
      "loss": 0.1,
      "step": 135000
    },
    {
      "epoch": 5.67,
      "eval_loss": 0.08851899951696396,
      "eval_runtime": 4884.1688,
      "eval_samples_per_second": 4.184,
      "step": 135000
    },
    {
      "epoch": 5.88,
      "learning_rate": 0.00012440941554325318,
      "loss": 0.0971,
      "step": 140000
    },
    {
      "epoch": 5.88,
      "eval_loss": 0.08720648288726807,
      "eval_runtime": 4896.4597,
      "eval_samples_per_second": 4.173,
      "step": 140000
    },
    {
      "epoch": 6.09,
      "learning_rate": 0.00011807040527405654,
      "loss": 0.0921,
      "step": 145000
    },
    {
      "epoch": 6.09,
      "eval_loss": 0.08666499704122543,
      "eval_runtime": 5117.0953,
      "eval_samples_per_second": 3.993,
      "step": 145000
    },
    {
      "epoch": 6.3,
      "learning_rate": 0.00011173139500485991,
      "loss": 0.0884,
      "step": 150000
    },
    {
      "epoch": 6.3,
      "eval_loss": 0.0831904485821724,
      "eval_runtime": 4879.0536,
      "eval_samples_per_second": 4.188,
      "step": 150000
    },
    {
      "epoch": 6.51,
      "learning_rate": 0.00010539238473566326,
      "loss": 0.0864,
      "step": 155000
    },
    {
      "epoch": 6.51,
      "eval_loss": 0.08337873965501785,
      "eval_runtime": 4900.7475,
      "eval_samples_per_second": 4.17,
      "step": 155000
    },
    {
      "epoch": 6.72,
      "learning_rate": 9.905337446646663e-05,
      "loss": 0.0861,
      "step": 160000
    },
    {
      "epoch": 6.72,
      "eval_loss": 0.08155979961156845,
      "eval_runtime": 4919.8246,
      "eval_samples_per_second": 4.154,
      "step": 160000
    },
    {
      "epoch": 6.93,
      "learning_rate": 9.271436419726999e-05,
      "loss": 0.083,
      "step": 165000
    },
    {
      "epoch": 6.93,
      "eval_loss": 0.08167865127325058,
      "eval_runtime": 4921.4971,
      "eval_samples_per_second": 4.152,
      "step": 165000
    },
    {
      "epoch": 7.14,
      "learning_rate": 8.637535392807336e-05,
      "loss": 0.0769,
      "step": 170000
    },
    {
      "epoch": 7.14,
      "eval_loss": 0.0775604099035263,
      "eval_runtime": 4893.4221,
      "eval_samples_per_second": 4.176,
      "step": 170000
    },
    {
      "epoch": 7.35,
      "learning_rate": 8.003634365887672e-05,
      "loss": 0.0749,
      "step": 175000
    },
    {
      "epoch": 7.35,
      "eval_loss": 0.07773936539888382,
      "eval_runtime": 4914.0163,
      "eval_samples_per_second": 4.159,
      "step": 175000
    },
    {
      "epoch": 7.56,
      "learning_rate": 7.369733338968009e-05,
      "loss": 0.0735,
      "step": 180000
    },
    {
      "epoch": 7.56,
      "eval_loss": 0.07420430332422256,
      "eval_runtime": 4934.0827,
      "eval_samples_per_second": 4.142,
      "step": 180000
    },
    {
      "epoch": 7.77,
      "learning_rate": 6.735832312048346e-05,
      "loss": 0.0715,
      "step": 185000
    },
    {
      "epoch": 7.77,
      "eval_loss": 0.07269106060266495,
      "eval_runtime": 4925.1046,
      "eval_samples_per_second": 4.149,
      "step": 185000
    },
    {
      "epoch": 7.98,
      "learning_rate": 6.1019312851286814e-05,
      "loss": 0.0702,
      "step": 190000
    },
    {
      "epoch": 7.98,
      "eval_loss": 0.07183075696229935,
      "eval_runtime": 4948.9933,
      "eval_samples_per_second": 4.129,
      "step": 190000
    }
  ],
  "max_steps": 238130,
  "num_train_epochs": 10,
  "total_flos": 5.387421756388246e+20,
  "trial_name": null,
  "trial_params": null
}