{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.2953367875647668,
  "eval_steps": 10000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012953367875647668,
      "grad_norm": 43.076076550077694,
      "learning_rate": 6.476683937823834e-09,
      "loss": 3.0704,
      "step": 10
    },
    {
      "epoch": 0.025906735751295335,
      "grad_norm": 40.9396389995215,
      "learning_rate": 1.2953367875647667e-08,
      "loss": 2.9515,
      "step": 20
    },
    {
      "epoch": 0.038860103626943004,
      "grad_norm": 40.97955285652574,
      "learning_rate": 1.9430051813471502e-08,
      "loss": 3.0545,
      "step": 30
    },
    {
      "epoch": 0.05181347150259067,
      "grad_norm": 41.33204715803873,
      "learning_rate": 2.5906735751295334e-08,
      "loss": 3.0228,
      "step": 40
    },
    {
      "epoch": 0.06476683937823834,
      "grad_norm": 42.23332442613627,
      "learning_rate": 3.238341968911917e-08,
      "loss": 3.0354,
      "step": 50
    },
    {
      "epoch": 0.07772020725388601,
      "grad_norm": 44.56518138334028,
      "learning_rate": 3.8860103626943005e-08,
      "loss": 3.0051,
      "step": 60
    },
    {
      "epoch": 0.09067357512953368,
      "grad_norm": 40.7767523212035,
      "learning_rate": 4.533678756476684e-08,
      "loss": 3.0326,
      "step": 70
    },
    {
      "epoch": 0.10362694300518134,
      "grad_norm": 37.191408333303734,
      "learning_rate": 5.181347150259067e-08,
      "loss": 2.97,
      "step": 80
    },
    {
      "epoch": 0.11658031088082901,
      "grad_norm": 37.31766884591996,
      "learning_rate": 5.8290155440414504e-08,
      "loss": 2.8748,
      "step": 90
    },
    {
      "epoch": 0.12953367875647667,
      "grad_norm": 36.208356127388186,
      "learning_rate": 6.476683937823834e-08,
      "loss": 2.8776,
      "step": 100
    },
    {
      "epoch": 0.14248704663212436,
      "grad_norm": 33.13789071114948,
      "learning_rate": 7.124352331606218e-08,
      "loss": 2.8728,
      "step": 110
    },
    {
      "epoch": 0.15544041450777202,
      "grad_norm": 25.73482630280206,
      "learning_rate": 7.772020725388601e-08,
      "loss": 2.709,
      "step": 120
    },
    {
      "epoch": 0.16839378238341968,
      "grad_norm": 22.038324890804294,
      "learning_rate": 8.419689119170984e-08,
      "loss": 2.6382,
      "step": 130
    },
    {
      "epoch": 0.18134715025906736,
      "grad_norm": 22.185048967857036,
      "learning_rate": 9.067357512953368e-08,
      "loss": 2.5554,
      "step": 140
    },
    {
      "epoch": 0.19430051813471502,
      "grad_norm": 18.083733865410974,
      "learning_rate": 9.715025906735751e-08,
      "loss": 2.4812,
      "step": 150
    },
    {
      "epoch": 0.20725388601036268,
      "grad_norm": 9.017324625226863,
      "learning_rate": 1.0362694300518134e-07,
      "loss": 2.3281,
      "step": 160
    },
    {
      "epoch": 0.22020725388601037,
      "grad_norm": 7.678118962268154,
      "learning_rate": 1.1010362694300518e-07,
      "loss": 2.2715,
      "step": 170
    },
    {
      "epoch": 0.23316062176165803,
      "grad_norm": 6.290504564645648,
      "learning_rate": 1.1658031088082901e-07,
      "loss": 2.2517,
      "step": 180
    },
    {
      "epoch": 0.24611398963730569,
      "grad_norm": 5.562957539224074,
      "learning_rate": 1.2305699481865284e-07,
      "loss": 2.2002,
      "step": 190
    },
    {
      "epoch": 0.25906735751295334,
      "grad_norm": 4.640851265650242,
      "learning_rate": 1.2953367875647668e-07,
      "loss": 2.2032,
      "step": 200
    },
    {
      "epoch": 0.27202072538860106,
      "grad_norm": 4.098319743680852,
      "learning_rate": 1.3601036269430052e-07,
      "loss": 2.1007,
      "step": 210
    },
    {
      "epoch": 0.2849740932642487,
      "grad_norm": 3.8703435452061665,
      "learning_rate": 1.4248704663212436e-07,
      "loss": 2.1256,
      "step": 220
    },
    {
      "epoch": 0.2979274611398964,
      "grad_norm": 3.804744602548621,
      "learning_rate": 1.4896373056994818e-07,
      "loss": 2.1334,
      "step": 230
    },
    {
      "epoch": 0.31088082901554404,
      "grad_norm": 3.72894289463076,
      "learning_rate": 1.5544041450777202e-07,
      "loss": 2.1497,
      "step": 240
    },
    {
      "epoch": 0.3238341968911917,
      "grad_norm": 3.6159970324410975,
      "learning_rate": 1.6191709844559583e-07,
      "loss": 2.1146,
      "step": 250
    },
    {
      "epoch": 0.33678756476683935,
      "grad_norm": 3.4513794224622187,
      "learning_rate": 1.6839378238341968e-07,
      "loss": 2.0987,
      "step": 260
    },
    {
      "epoch": 0.34974093264248707,
      "grad_norm": 3.4264148655282707,
      "learning_rate": 1.7487046632124352e-07,
      "loss": 2.0694,
      "step": 270
    },
    {
      "epoch": 0.3626943005181347,
      "grad_norm": 3.2474565476427633,
      "learning_rate": 1.8134715025906736e-07,
      "loss": 2.087,
      "step": 280
    },
    {
      "epoch": 0.3756476683937824,
      "grad_norm": 3.5065133087524027,
      "learning_rate": 1.8782383419689118e-07,
      "loss": 2.1012,
      "step": 290
    },
    {
      "epoch": 0.38860103626943004,
      "grad_norm": 3.2610500040097623,
      "learning_rate": 1.9430051813471502e-07,
      "loss": 2.0864,
      "step": 300
    },
    {
      "epoch": 0.4015544041450777,
      "grad_norm": 3.46273816431845,
      "learning_rate": 2.0077720207253883e-07,
      "loss": 2.1223,
      "step": 310
    },
    {
      "epoch": 0.41450777202072536,
      "grad_norm": 3.4591348076759,
      "learning_rate": 2.0725388601036267e-07,
      "loss": 2.0511,
      "step": 320
    },
    {
      "epoch": 0.4274611398963731,
      "grad_norm": 3.143552612232648,
      "learning_rate": 2.1373056994818652e-07,
      "loss": 2.0698,
      "step": 330
    },
    {
      "epoch": 0.44041450777202074,
      "grad_norm": 3.3930346159225193,
      "learning_rate": 2.2020725388601036e-07,
      "loss": 2.062,
      "step": 340
    },
    {
      "epoch": 0.4533678756476684,
      "grad_norm": 3.1968277262246785,
      "learning_rate": 2.2668393782383417e-07,
      "loss": 2.0499,
      "step": 350
    },
    {
      "epoch": 0.46632124352331605,
      "grad_norm": 3.1762842339842545,
      "learning_rate": 2.3316062176165802e-07,
      "loss": 2.0388,
      "step": 360
    },
    {
      "epoch": 0.4792746113989637,
      "grad_norm": 3.1131594154841404,
      "learning_rate": 2.3963730569948183e-07,
      "loss": 2.071,
      "step": 370
    },
    {
      "epoch": 0.49222797927461137,
      "grad_norm": 3.4012154706204187,
      "learning_rate": 2.4611398963730567e-07,
      "loss": 2.0207,
      "step": 380
    },
    {
      "epoch": 0.5051813471502591,
      "grad_norm": 3.32603264761156,
      "learning_rate": 2.525906735751295e-07,
      "loss": 2.0551,
      "step": 390
    },
    {
      "epoch": 0.5181347150259067,
      "grad_norm": 3.0793049667074555,
      "learning_rate": 2.5906735751295336e-07,
      "loss": 2.0076,
      "step": 400
    },
    {
      "epoch": 0.5310880829015544,
      "grad_norm": 3.3022746352622536,
      "learning_rate": 2.655440414507772e-07,
      "loss": 2.0483,
      "step": 410
    },
    {
      "epoch": 0.5440414507772021,
      "grad_norm": 3.231444229889239,
      "learning_rate": 2.7202072538860104e-07,
      "loss": 2.0366,
      "step": 420
    },
    {
      "epoch": 0.5569948186528497,
      "grad_norm": 3.168051485737994,
      "learning_rate": 2.7849740932642483e-07,
      "loss": 2.0352,
      "step": 430
    },
    {
      "epoch": 0.5699481865284974,
      "grad_norm": 3.3177612879303284,
      "learning_rate": 2.849740932642487e-07,
      "loss": 2.0783,
      "step": 440
    },
    {
      "epoch": 0.582901554404145,
      "grad_norm": 3.0961489412889542,
      "learning_rate": 2.914507772020725e-07,
      "loss": 2.0185,
      "step": 450
    },
    {
      "epoch": 0.5958549222797928,
      "grad_norm": 3.192249587055236,
      "learning_rate": 2.9792746113989635e-07,
      "loss": 2.0432,
      "step": 460
    },
    {
      "epoch": 0.6088082901554405,
      "grad_norm": 3.2200126119045698,
      "learning_rate": 3.044041450777202e-07,
      "loss": 2.0553,
      "step": 470
    },
    {
      "epoch": 0.6217616580310881,
      "grad_norm": 3.207430731372013,
      "learning_rate": 3.1088082901554404e-07,
      "loss": 2.0304,
      "step": 480
    },
    {
      "epoch": 0.6347150259067358,
      "grad_norm": 3.196953450486009,
      "learning_rate": 3.173575129533679e-07,
      "loss": 2.0417,
      "step": 490
    },
    {
      "epoch": 0.6476683937823834,
      "grad_norm": 3.0518436646257334,
      "learning_rate": 3.2383419689119167e-07,
      "loss": 1.9995,
      "step": 500
    },
    {
      "epoch": 0.6606217616580311,
      "grad_norm": 3.0960475518738546,
      "learning_rate": 3.303108808290155e-07,
      "loss": 2.0019,
      "step": 510
    },
    {
      "epoch": 0.6735751295336787,
      "grad_norm": 3.4734729489261253,
      "learning_rate": 3.3678756476683935e-07,
      "loss": 2.0214,
      "step": 520
    },
    {
      "epoch": 0.6865284974093264,
      "grad_norm": 3.0715495927160106,
      "learning_rate": 3.432642487046632e-07,
      "loss": 2.0165,
      "step": 530
    },
    {
      "epoch": 0.6994818652849741,
      "grad_norm": 3.298121733914402,
      "learning_rate": 3.4974093264248704e-07,
      "loss": 1.9978,
      "step": 540
    },
    {
      "epoch": 0.7124352331606217,
      "grad_norm": 3.224003768850699,
      "learning_rate": 3.562176165803109e-07,
      "loss": 2.0165,
      "step": 550
    },
    {
      "epoch": 0.7253886010362695,
      "grad_norm": 3.207845261761618,
      "learning_rate": 3.626943005181347e-07,
      "loss": 2.0213,
      "step": 560
    },
    {
      "epoch": 0.7383419689119171,
      "grad_norm": 2.9379832555269103,
      "learning_rate": 3.691709844559585e-07,
      "loss": 1.9945,
      "step": 570
    },
    {
      "epoch": 0.7512953367875648,
      "grad_norm": 3.1048255292841542,
      "learning_rate": 3.7564766839378235e-07,
      "loss": 2.0447,
      "step": 580
    },
    {
      "epoch": 0.7642487046632125,
      "grad_norm": 3.1559283368532727,
      "learning_rate": 3.8212435233160625e-07,
      "loss": 2.0091,
      "step": 590
    },
    {
      "epoch": 0.7772020725388601,
      "grad_norm": 3.0089470388517148,
      "learning_rate": 3.8860103626943004e-07,
      "loss": 2.024,
      "step": 600
    },
    {
      "epoch": 0.7901554404145078,
      "grad_norm": 2.989858285669738,
      "learning_rate": 3.950777202072539e-07,
      "loss": 2.0211,
      "step": 610
    },
    {
      "epoch": 0.8031088082901554,
      "grad_norm": 2.9838710647706184,
      "learning_rate": 4.0155440414507767e-07,
      "loss": 2.0124,
      "step": 620
    },
    {
      "epoch": 0.8160621761658031,
      "grad_norm": 3.1331786862223545,
      "learning_rate": 4.0803108808290156e-07,
      "loss": 2.0164,
      "step": 630
    },
    {
      "epoch": 0.8290155440414507,
      "grad_norm": 2.8882108643266484,
      "learning_rate": 4.1450777202072535e-07,
      "loss": 1.9743,
      "step": 640
    },
    {
      "epoch": 0.8419689119170984,
      "grad_norm": 3.1820944740927946,
      "learning_rate": 4.209844559585492e-07,
      "loss": 1.9973,
      "step": 650
    },
    {
      "epoch": 0.8549222797927462,
      "grad_norm": 3.1018459499442184,
      "learning_rate": 4.2746113989637303e-07,
      "loss": 2.0062,
      "step": 660
    },
    {
      "epoch": 0.8678756476683938,
      "grad_norm": 3.08458871931019,
      "learning_rate": 4.339378238341969e-07,
      "loss": 1.9692,
      "step": 670
    },
    {
      "epoch": 0.8808290155440415,
      "grad_norm": 3.113974525500442,
      "learning_rate": 4.404145077720207e-07,
      "loss": 2.0378,
      "step": 680
    },
    {
      "epoch": 0.8937823834196891,
      "grad_norm": 3.376694314365291,
      "learning_rate": 4.468911917098445e-07,
      "loss": 1.9943,
      "step": 690
    },
    {
      "epoch": 0.9067357512953368,
      "grad_norm": 3.268661205585023,
      "learning_rate": 4.5336787564766835e-07,
      "loss": 1.9977,
      "step": 700
    },
    {
      "epoch": 0.9196891191709845,
      "grad_norm": 2.996627195399274,
      "learning_rate": 4.5984455958549224e-07,
      "loss": 1.9782,
      "step": 710
    },
    {
      "epoch": 0.9326424870466321,
      "grad_norm": 2.932454046391058,
      "learning_rate": 4.6632124352331603e-07,
      "loss": 1.9766,
      "step": 720
    },
    {
      "epoch": 0.9455958549222798,
      "grad_norm": 3.2738206104680088,
      "learning_rate": 4.7279792746113987e-07,
      "loss": 1.9388,
      "step": 730
    },
    {
      "epoch": 0.9585492227979274,
      "grad_norm": 2.9239778843618884,
      "learning_rate": 4.792746113989637e-07,
      "loss": 1.9716,
      "step": 740
    },
    {
      "epoch": 0.9715025906735751,
      "grad_norm": 3.1972940346458265,
      "learning_rate": 4.857512953367875e-07,
      "loss": 2.0161,
      "step": 750
    },
    {
      "epoch": 0.9844559585492227,
      "grad_norm": 3.3347499347681016,
      "learning_rate": 4.922279792746113e-07,
      "loss": 1.9765,
      "step": 760
    },
    {
      "epoch": 0.9974093264248705,
      "grad_norm": 3.0568292052459585,
      "learning_rate": 4.987046632124352e-07,
      "loss": 1.9368,
      "step": 770
    },
    {
      "epoch": 1.0103626943005182,
      "grad_norm": 2.9881753806367066,
      "learning_rate": 5.05181347150259e-07,
      "loss": 1.9666,
      "step": 780
    },
    {
      "epoch": 1.0233160621761659,
      "grad_norm": 3.2370659549778074,
      "learning_rate": 5.116580310880829e-07,
      "loss": 1.9904,
      "step": 790
    },
    {
      "epoch": 1.0362694300518134,
      "grad_norm": 3.0236879879482945,
      "learning_rate": 5.181347150259067e-07,
      "loss": 1.9652,
      "step": 800
    },
    {
      "epoch": 1.049222797927461,
      "grad_norm": 2.905006361103011,
      "learning_rate": 5.246113989637306e-07,
      "loss": 1.9845,
      "step": 810
    },
    {
      "epoch": 1.0621761658031088,
      "grad_norm": 3.155651179064785,
      "learning_rate": 5.310880829015544e-07,
      "loss": 1.9439,
      "step": 820
    },
    {
      "epoch": 1.0751295336787565,
      "grad_norm": 3.0538373364331957,
      "learning_rate": 5.375647668393782e-07,
      "loss": 1.9636,
      "step": 830
    },
    {
      "epoch": 1.0880829015544042,
      "grad_norm": 2.941917535285299,
      "learning_rate": 5.440414507772021e-07,
      "loss": 1.928,
      "step": 840
    },
    {
      "epoch": 1.1010362694300517,
      "grad_norm": 3.143750273608143,
      "learning_rate": 5.505181347150258e-07,
      "loss": 1.988,
      "step": 850
    },
    {
      "epoch": 1.1139896373056994,
      "grad_norm": 3.0526575513233576,
      "learning_rate": 5.569948186528497e-07,
      "loss": 2.0298,
      "step": 860
    },
    {
      "epoch": 1.1269430051813472,
      "grad_norm": 3.139974022960326,
      "learning_rate": 5.634715025906735e-07,
      "loss": 1.9806,
      "step": 870
    },
    {
      "epoch": 1.1398963730569949,
      "grad_norm": 2.962182207514898,
      "learning_rate": 5.699481865284974e-07,
      "loss": 2.0052,
      "step": 880
    },
    {
      "epoch": 1.1528497409326426,
      "grad_norm": 3.332736006863561,
      "learning_rate": 5.764248704663213e-07,
      "loss": 1.9777,
      "step": 890
    },
    {
      "epoch": 1.16580310880829,
      "grad_norm": 3.047159518577182,
      "learning_rate": 5.82901554404145e-07,
      "loss": 1.9816,
      "step": 900
    },
    {
      "epoch": 1.1787564766839378,
      "grad_norm": 3.0900251170460473,
      "learning_rate": 5.893782383419689e-07,
      "loss": 1.9539,
      "step": 910
    },
    {
      "epoch": 1.1917098445595855,
      "grad_norm": 3.2156155562663415,
      "learning_rate": 5.958549222797927e-07,
      "loss": 1.9271,
      "step": 920
    },
    {
      "epoch": 1.2046632124352332,
      "grad_norm": 3.087118578541963,
      "learning_rate": 6.023316062176166e-07,
      "loss": 1.9906,
      "step": 930
    },
    {
      "epoch": 1.2176165803108807,
      "grad_norm": 2.972038701138991,
      "learning_rate": 6.088082901554404e-07,
      "loss": 1.9839,
      "step": 940
    },
    {
      "epoch": 1.2305699481865284,
      "grad_norm": 2.953738637620668,
      "learning_rate": 6.152849740932642e-07,
      "loss": 1.9457,
      "step": 950
    },
    {
      "epoch": 1.2435233160621761,
      "grad_norm": 3.307869579476457,
      "learning_rate": 6.217616580310881e-07,
      "loss": 1.9658,
      "step": 960
    },
    {
      "epoch": 1.2564766839378239,
      "grad_norm": 2.8754881477979253,
      "learning_rate": 6.282383419689119e-07,
      "loss": 1.9479,
      "step": 970
    },
    {
      "epoch": 1.2694300518134716,
      "grad_norm": 3.516289819557544,
      "learning_rate": 6.347150259067358e-07,
      "loss": 1.9684,
      "step": 980
    },
    {
      "epoch": 1.2823834196891193,
      "grad_norm": 3.173293560970888,
      "learning_rate": 6.411917098445595e-07,
      "loss": 1.9508,
      "step": 990
    },
    {
      "epoch": 1.2953367875647668,
      "grad_norm": 3.167031160499092,
      "learning_rate": 6.476683937823833e-07,
      "loss": 1.9548,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 15440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 53264261382144.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
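
A minimal sketch of how a trainer_state.json like the one above can be inspected, using only the Python standard library. The filename "trainer_state.json" and the summary printed at the end are illustrative; they are not part of the checkpoint itself.

import json

# Load the Trainer checkpoint state shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry for a training step carries epoch, grad_norm,
# learning_rate, loss, and step; only entries with a "loss" key are
# training logs (eval logs, if any, would use different keys).
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

print(f"{len(steps)} logged points, every {state['logging_steps']} steps")
print(f"loss {losses[0]:.4f} -> {losses[-1]:.4f} "
      f"over steps {steps[0]}..{steps[-1]} "
      f"(global_step {state['global_step']} of {state['max_steps']})")

For this file, the sketch would report 100 logged points and a loss falling from 3.0704 at step 10 to 1.9548 at step 1000.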