{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.35, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 0.40701737999916077, "learning_rate": 6.417e-06, "loss": 1.9589118957519531, "step": 10 }, { "epoch": 0.0025, "grad_norm": 0.3704953193664551, "learning_rate": 1.3547e-05, "loss": 1.879218864440918, "step": 20 }, { "epoch": 0.00375, "grad_norm": 0.34090375900268555, "learning_rate": 2.0677e-05, "loss": 1.8871658325195313, "step": 30 }, { "epoch": 0.005, "grad_norm": 0.33982428908348083, "learning_rate": 2.7807e-05, "loss": 1.8348798751831055, "step": 40 }, { "epoch": 0.00625, "grad_norm": 0.3448389172554016, "learning_rate": 3.4937e-05, "loss": 1.8976055145263673, "step": 50 }, { "epoch": 0.0075, "grad_norm": 0.3351344168186188, "learning_rate": 4.2066999999999996e-05, "loss": 1.8488676071166992, "step": 60 }, { "epoch": 0.00875, "grad_norm": 0.33170202374458313, "learning_rate": 4.9197e-05, "loss": 1.8325592041015626, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.34600478410720825, "learning_rate": 5.6327e-05, "loss": 1.8475696563720703, "step": 80 }, { "epoch": 0.01125, "grad_norm": 0.34344804286956787, "learning_rate": 6.3457e-05, "loss": 1.8463781356811524, "step": 90 }, { "epoch": 0.0125, "grad_norm": 0.32425570487976074, "learning_rate": 7.0587e-05, "loss": 1.8811756134033204, "step": 100 }, { "epoch": 0.01375, "grad_norm": 0.33838146924972534, "learning_rate": 7.7717e-05, "loss": 1.8498527526855468, "step": 110 }, { "epoch": 0.015, "grad_norm": 0.34978190064430237, "learning_rate": 8.4847e-05, "loss": 1.7197338104248048, "step": 120 }, { "epoch": 0.01625, "grad_norm": 0.3554218113422394, "learning_rate": 9.1977e-05, "loss": 1.7990310668945313, "step": 130 }, { "epoch": 0.0175, "grad_norm": 0.3349857032299042, "learning_rate": 9.910699999999998e-05, "loss": 1.8458877563476563, "step": 140 }, { "epoch": 0.01875, "grad_norm": 0.3333263099193573, "learning_rate": 0.00010623699999999999, "loss": 1.8082691192626954, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.3492045998573303, "learning_rate": 0.000113367, "loss": 1.7753177642822267, "step": 160 }, { "epoch": 0.02125, "grad_norm": 0.33766260743141174, "learning_rate": 0.000120497, "loss": 1.7588382720947267, "step": 170 }, { "epoch": 0.0225, "grad_norm": 0.3680027723312378, "learning_rate": 0.000127627, "loss": 1.7494930267333983, "step": 180 }, { "epoch": 0.02375, "grad_norm": 0.35260000824928284, "learning_rate": 0.000134757, "loss": 1.758560562133789, "step": 190 }, { "epoch": 0.025, "grad_norm": 0.3592912256717682, "learning_rate": 0.000141887, "loss": 1.8017724990844726, "step": 200 }, { "epoch": 0.02625, "grad_norm": 0.34770476818084717, "learning_rate": 0.00014259953155930407, "loss": 1.8061519622802735, "step": 210 }, { "epoch": 0.0275, "grad_norm": 0.358970582485199, "learning_rate": 0.00014259791226603537, "loss": 1.8515422821044922, "step": 220 }, { "epoch": 0.02875, "grad_norm": 0.34490638971328735, "learning_rate": 0.00014259513636323773, "loss": 1.8080307006835938, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.3587310016155243, "learning_rate": 0.00014259120389594238, "loss": 1.8180580139160156, "step": 240 }, { "epoch": 0.03125, "grad_norm": 0.35348573327064514, "learning_rate": 0.0001425861149279427, "loss": 1.822945785522461, "step": 250 }, { "epoch": 0.0325, "grad_norm": 0.3408539891242981, "learning_rate": 0.00014257986954179292, "loss": 1.804990577697754, "step": 260 }, { "epoch": 0.03375, "grad_norm": 0.35097193717956543, "learning_rate": 0.00014257246783880696, "loss": 1.8341880798339845, "step": 270 }, { "epoch": 0.035, "grad_norm": 0.3467462956905365, "learning_rate": 0.00014256390993905687, "loss": 1.7296785354614257, "step": 280 }, { "epoch": 0.03625, "grad_norm": 0.3492400050163269, "learning_rate": 0.00014255419598137062, "loss": 1.8266151428222657, "step": 290 }, { "epoch": 0.0375, "grad_norm": 0.3718615472316742, "learning_rate": 0.00014254332612333005, "loss": 1.7514339447021485, "step": 300 }, { "epoch": 0.03875, "grad_norm": 0.3476354479789734, "learning_rate": 0.00014253130054126827, "loss": 1.8226016998291015, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.34655508399009705, "learning_rate": 0.00014251811943026674, "loss": 1.8513336181640625, "step": 320 }, { "epoch": 0.04125, "grad_norm": 0.3519170880317688, "learning_rate": 0.00014250378300415223, "loss": 1.864480972290039, "step": 330 }, { "epoch": 0.0425, "grad_norm": 0.3491443395614624, "learning_rate": 0.00014248829149549318, "loss": 1.8030773162841798, "step": 340 }, { "epoch": 0.04375, "grad_norm": 0.3646671175956726, "learning_rate": 0.00014247164515559605, "loss": 1.782710647583008, "step": 350 }, { "epoch": 0.045, "grad_norm": 0.3525862395763397, "learning_rate": 0.00014245384425450123, "loss": 1.8301689147949218, "step": 360 }, { "epoch": 0.04625, "grad_norm": 0.3430674970149994, "learning_rate": 0.00014243488908097866, "loss": 1.7636734008789063, "step": 370 }, { "epoch": 0.0475, "grad_norm": 0.3655545115470886, "learning_rate": 0.00014241477994252308, "loss": 1.8431385040283204, "step": 380 }, { "epoch": 0.04875, "grad_norm": 0.35655322670936584, "learning_rate": 0.00014239351716534906, "loss": 1.8405876159667969, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.3450303077697754, "learning_rate": 0.00014237110109438587, "loss": 1.7880744934082031, "step": 400 }, { "epoch": 0.05125, "grad_norm": 0.36362725496292114, "learning_rate": 0.0001423475320932716, "loss": 1.803448486328125, "step": 410 }, { "epoch": 0.0525, "grad_norm": 0.3608654737472534, "learning_rate": 0.0001423228105443475, "loss": 1.7959218978881837, "step": 420 }, { "epoch": 0.05375, "grad_norm": 0.3524814248085022, "learning_rate": 0.00014229693684865167, "loss": 1.8105106353759766, "step": 430 }, { "epoch": 0.055, "grad_norm": 0.35871171951293945, "learning_rate": 0.0001422699114259126, "loss": 1.7514846801757813, "step": 440 }, { "epoch": 0.05625, "grad_norm": 0.3381369709968567, "learning_rate": 0.00014224173471454223, "loss": 1.811713981628418, "step": 450 }, { "epoch": 0.0575, "grad_norm": 0.3746880292892456, "learning_rate": 0.00014221240717162908, "loss": 1.7895519256591796, "step": 460 }, { "epoch": 0.05875, "grad_norm": 0.35921189188957214, "learning_rate": 0.00014218192927293062, "loss": 1.7877384185791017, "step": 470 }, { "epoch": 0.06, "grad_norm": 0.3727467656135559, "learning_rate": 0.00014215030151286563, "loss": 1.8092086791992188, "step": 480 }, { "epoch": 0.06125, "grad_norm": 0.36004638671875, "learning_rate": 0.00014211752440450624, "loss": 1.845526123046875, "step": 490 }, { "epoch": 0.0625, "grad_norm": 0.34500977396965027, "learning_rate": 0.00014208359847956947, "loss": 1.793890380859375, "step": 500 }, { "epoch": 0.06375, "grad_norm": 0.3571811020374298, "learning_rate": 0.00014204852428840873, "loss": 1.8021648406982422, "step": 510 }, { "epoch": 0.065, "grad_norm": 0.3511386513710022, "learning_rate": 0.0001420123024000048, "loss": 1.7810476303100586, "step": 520 }, { "epoch": 0.06625, "grad_norm": 0.3544309139251709, "learning_rate": 0.00014197493340195673, "loss": 1.782750701904297, "step": 530 }, { "epoch": 0.0675, "grad_norm": 0.35211437940597534, "learning_rate": 0.00014193641790047207, "loss": 1.8397369384765625, "step": 540 }, { "epoch": 0.06875, "grad_norm": 0.3561457097530365, "learning_rate": 0.00014189675652035737, "loss": 1.806086540222168, "step": 550 }, { "epoch": 0.07, "grad_norm": 0.3514038026332855, "learning_rate": 0.0001418559499050077, "loss": 1.7963085174560547, "step": 560 }, { "epoch": 0.07125, "grad_norm": 0.35221120715141296, "learning_rate": 0.00014181399871639652, "loss": 1.777400016784668, "step": 570 }, { "epoch": 0.0725, "grad_norm": 0.34728357195854187, "learning_rate": 0.00014177090363506466, "loss": 1.7832159042358398, "step": 580 }, { "epoch": 0.07375, "grad_norm": 0.35810062289237976, "learning_rate": 0.00014172666536010946, "loss": 1.7859878540039062, "step": 590 }, { "epoch": 0.075, "grad_norm": 0.3402475118637085, "learning_rate": 0.00014168128460917344, "loss": 1.8559268951416015, "step": 600 }, { "epoch": 0.07625, "grad_norm": 0.36799490451812744, "learning_rate": 0.00014163476211843254, "loss": 1.8264755249023437, "step": 610 }, { "epoch": 0.0775, "grad_norm": 0.3646862804889679, "learning_rate": 0.00014158709864258424, "loss": 1.800428581237793, "step": 620 }, { "epoch": 0.07875, "grad_norm": 0.37956395745277405, "learning_rate": 0.00014153829495483538, "loss": 1.7767526626586914, "step": 630 }, { "epoch": 0.08, "grad_norm": 0.3566032648086548, "learning_rate": 0.00014148835184688949, "loss": 1.8472091674804687, "step": 640 }, { "epoch": 0.08125, "grad_norm": 0.333779513835907, "learning_rate": 0.000141437270128934, "loss": 1.8140777587890624, "step": 650 }, { "epoch": 0.0825, "grad_norm": 0.3429010212421417, "learning_rate": 0.0001413850506296272, "loss": 1.8366750717163085, "step": 660 }, { "epoch": 0.08375, "grad_norm": 0.3753111660480499, "learning_rate": 0.00014133169419608456, "loss": 1.760198211669922, "step": 670 }, { "epoch": 0.085, "grad_norm": 0.35503339767456055, "learning_rate": 0.0001412772016938653, "loss": 1.8173086166381835, "step": 680 }, { "epoch": 0.08625, "grad_norm": 0.358216792345047, "learning_rate": 0.0001412215740069581, "loss": 1.7937744140625, "step": 690 }, { "epoch": 0.0875, "grad_norm": 0.3600156605243683, "learning_rate": 0.00014116481203776677, "loss": 1.7986185073852539, "step": 700 }, { "epoch": 0.08875, "grad_norm": 0.3507816195487976, "learning_rate": 0.00014110691670709584, "loss": 1.7555866241455078, "step": 710 }, { "epoch": 0.09, "grad_norm": 0.35459256172180176, "learning_rate": 0.00014104788895413529, "loss": 1.795433807373047, "step": 720 }, { "epoch": 0.09125, "grad_norm": 0.35286569595336914, "learning_rate": 0.00014098772973644564, "loss": 1.820347213745117, "step": 730 }, { "epoch": 0.0925, "grad_norm": 0.3857751786708832, "learning_rate": 0.00014092644002994218, "loss": 1.8153291702270509, "step": 740 }, { "epoch": 0.09375, "grad_norm": 0.3553074598312378, "learning_rate": 0.00014086402082887924, "loss": 1.8413051605224608, "step": 750 }, { "epoch": 0.095, "grad_norm": 0.35642898082733154, "learning_rate": 0.0001408004731458341, "loss": 1.7815227508544922, "step": 760 }, { "epoch": 0.09625, "grad_norm": 0.37263238430023193, "learning_rate": 0.00014073579801169043, "loss": 1.8360301971435546, "step": 770 }, { "epoch": 0.0975, "grad_norm": 0.37507593631744385, "learning_rate": 0.00014066999647562167, "loss": 1.8166229248046875, "step": 780 }, { "epoch": 0.09875, "grad_norm": 0.3496163487434387, "learning_rate": 0.00014060306960507398, "loss": 1.7876134872436524, "step": 790 }, { "epoch": 0.1, "grad_norm": 0.350668340921402, "learning_rate": 0.000140535018485749, "loss": 1.8262884140014648, "step": 800 }, { "epoch": 0.10125, "grad_norm": 0.36257749795913696, "learning_rate": 0.00014046584422158602, "loss": 1.7791305541992188, "step": 810 }, { "epoch": 0.1025, "grad_norm": 0.357570081949234, "learning_rate": 0.00014039554793474442, "loss": 1.8329212188720703, "step": 820 }, { "epoch": 0.10375, "grad_norm": 0.354640930891037, "learning_rate": 0.00014032413076558507, "loss": 1.7825984954833984, "step": 830 }, { "epoch": 0.105, "grad_norm": 0.35969364643096924, "learning_rate": 0.00014025159387265215, "loss": 1.7961544036865233, "step": 840 }, { "epoch": 0.10625, "grad_norm": 0.3408399224281311, "learning_rate": 0.00014017793843265416, "loss": 1.8031917572021485, "step": 850 }, { "epoch": 0.1075, "grad_norm": 0.3505636751651764, "learning_rate": 0.00014010316564044495, "loss": 1.8270240783691407, "step": 860 }, { "epoch": 0.10875, "grad_norm": 0.3612024784088135, "learning_rate": 0.00014002727670900427, "loss": 1.8037662506103516, "step": 870 }, { "epoch": 0.11, "grad_norm": 0.3611273467540741, "learning_rate": 0.00013995027286941813, "loss": 1.7805574417114258, "step": 880 }, { "epoch": 0.11125, "grad_norm": 0.370518296957016, "learning_rate": 0.00013987215537085876, "loss": 1.83743896484375, "step": 890 }, { "epoch": 0.1125, "grad_norm": 0.3627995550632477, "learning_rate": 0.00013979292548056446, "loss": 1.8568729400634765, "step": 900 }, { "epoch": 0.11375, "grad_norm": 0.33446118235588074, "learning_rate": 0.00013971258448381896, "loss": 1.8121458053588868, "step": 910 }, { "epoch": 0.115, "grad_norm": 0.35702356696128845, "learning_rate": 0.00013963113368393058, "loss": 1.8272817611694336, "step": 920 }, { "epoch": 0.11625, "grad_norm": 0.35480058193206787, "learning_rate": 0.00013954857440221107, "loss": 1.8286819458007812, "step": 930 }, { "epoch": 0.1175, "grad_norm": 0.33891281485557556, "learning_rate": 0.00013946490797795425, "loss": 1.7881786346435546, "step": 940 }, { "epoch": 0.11875, "grad_norm": 0.34998786449432373, "learning_rate": 0.00013938013576841426, "loss": 1.8192798614501953, "step": 950 }, { "epoch": 0.12, "grad_norm": 0.36356785893440247, "learning_rate": 0.0001392942591487834, "loss": 1.8080211639404298, "step": 960 }, { "epoch": 0.12125, "grad_norm": 0.3536245822906494, "learning_rate": 0.00013920727951217003, "loss": 1.7745712280273438, "step": 970 }, { "epoch": 0.1225, "grad_norm": 0.35819944739341736, "learning_rate": 0.00013911919826957588, "loss": 1.8335809707641602, "step": 980 }, { "epoch": 0.12375, "grad_norm": 0.3673238754272461, "learning_rate": 0.0001390300168498732, "loss": 1.7918657302856444, "step": 990 }, { "epoch": 0.125, "grad_norm": 0.37633419036865234, "learning_rate": 0.0001389397366997814, "loss": 1.7912788391113281, "step": 1000 }, { "epoch": 0.12625, "grad_norm": 0.36260703206062317, "learning_rate": 0.00013884835928384387, "loss": 1.7769220352172852, "step": 1010 }, { "epoch": 0.1275, "grad_norm": 0.3502698242664337, "learning_rate": 0.00013875588608440397, "loss": 1.8571086883544923, "step": 1020 }, { "epoch": 0.12875, "grad_norm": 0.37244319915771484, "learning_rate": 0.0001386623186015812, "loss": 1.7873695373535157, "step": 1030 }, { "epoch": 0.13, "grad_norm": 0.36906760931015015, "learning_rate": 0.00013856765835324657, "loss": 1.7982921600341797, "step": 1040 }, { "epoch": 0.13125, "grad_norm": 0.3458193838596344, "learning_rate": 0.0001384719068749984, "loss": 1.896946907043457, "step": 1050 }, { "epoch": 0.1325, "grad_norm": 0.3625653088092804, "learning_rate": 0.00013837506572013695, "loss": 1.8590087890625, "step": 1060 }, { "epoch": 0.13375, "grad_norm": 0.37704798579216003, "learning_rate": 0.00013827713645963959, "loss": 1.7953170776367187, "step": 1070 }, { "epoch": 0.135, "grad_norm": 0.35103756189346313, "learning_rate": 0.00013817812068213505, "loss": 1.864565658569336, "step": 1080 }, { "epoch": 0.13625, "grad_norm": 0.39145445823669434, "learning_rate": 0.0001380780199938779, "loss": 1.787282371520996, "step": 1090 }, { "epoch": 0.1375, "grad_norm": 0.3810483515262604, "learning_rate": 0.00013797683601872218, "loss": 1.8461406707763672, "step": 1100 }, { "epoch": 0.13875, "grad_norm": 0.36001554131507874, "learning_rate": 0.00013787457039809542, "loss": 1.7846809387207032, "step": 1110 }, { "epoch": 0.14, "grad_norm": 0.36254000663757324, "learning_rate": 0.0001377712247909717, "loss": 1.8589000701904297, "step": 1120 }, { "epoch": 0.14125, "grad_norm": 0.3535791337490082, "learning_rate": 0.00013766680087384488, "loss": 1.790989875793457, "step": 1130 }, { "epoch": 0.1425, "grad_norm": 0.36819183826446533, "learning_rate": 0.00013756130034070147, "loss": 1.8115760803222656, "step": 1140 }, { "epoch": 0.14375, "grad_norm": 0.35042834281921387, "learning_rate": 0.00013745472490299298, "loss": 1.7872331619262696, "step": 1150 }, { "epoch": 0.145, "grad_norm": 0.36452701687812805, "learning_rate": 0.0001373470762896083, "loss": 1.8083602905273437, "step": 1160 }, { "epoch": 0.14625, "grad_norm": 0.35632047057151794, "learning_rate": 0.00013723835624684556, "loss": 1.8238039016723633, "step": 1170 }, { "epoch": 0.1475, "grad_norm": 0.36330121755599976, "learning_rate": 0.00013712856653838384, "loss": 1.8468303680419922, "step": 1180 }, { "epoch": 0.14875, "grad_norm": 0.37948107719421387, "learning_rate": 0.0001370177089452546, "loss": 1.7772663116455079, "step": 1190 }, { "epoch": 0.15, "grad_norm": 0.3759608268737793, "learning_rate": 0.0001369057852658127, "loss": 1.793960952758789, "step": 1200 }, { "epoch": 0.15125, "grad_norm": 0.3672516644001007, "learning_rate": 0.00013679279731570733, "loss": 1.7799537658691407, "step": 1210 }, { "epoch": 0.1525, "grad_norm": 0.3496241569519043, "learning_rate": 0.00013667874692785244, "loss": 1.7861103057861327, "step": 1220 }, { "epoch": 0.15375, "grad_norm": 0.3461642265319824, "learning_rate": 0.00013656363595239708, "loss": 1.8481361389160156, "step": 1230 }, { "epoch": 0.155, "grad_norm": 0.33858028054237366, "learning_rate": 0.0001364474662566954, "loss": 1.77642822265625, "step": 1240 }, { "epoch": 0.15625, "grad_norm": 0.3424132764339447, "learning_rate": 0.00013633023972527632, "loss": 1.7893180847167969, "step": 1250 }, { "epoch": 0.1575, "grad_norm": 0.35095998644828796, "learning_rate": 0.00013621195825981293, "loss": 1.7366466522216797, "step": 1260 }, { "epoch": 0.15875, "grad_norm": 0.36417317390441895, "learning_rate": 0.00013609262377909176, "loss": 1.839132308959961, "step": 1270 }, { "epoch": 0.16, "grad_norm": 0.3565835654735565, "learning_rate": 0.00013597223821898145, "loss": 1.757269287109375, "step": 1280 }, { "epoch": 0.16125, "grad_norm": 0.34676891565322876, "learning_rate": 0.00013585080353240158, "loss": 1.781381607055664, "step": 1290 }, { "epoch": 0.1625, "grad_norm": 0.3492533564567566, "learning_rate": 0.00013572832168929085, "loss": 1.8004392623901366, "step": 1300 }, { "epoch": 0.16375, "grad_norm": 0.33528923988342285, "learning_rate": 0.0001356047946765751, "loss": 1.7787307739257812, "step": 1310 }, { "epoch": 0.165, "grad_norm": 0.35009509325027466, "learning_rate": 0.00013548022449813522, "loss": 1.7703327178955077, "step": 1320 }, { "epoch": 0.16625, "grad_norm": 0.38126665353775024, "learning_rate": 0.00013535461317477446, "loss": 1.8216169357299805, "step": 1330 }, { "epoch": 0.1675, "grad_norm": 0.3653838038444519, "learning_rate": 0.00013522796274418575, "loss": 1.784686279296875, "step": 1340 }, { "epoch": 0.16875, "grad_norm": 0.35842376947402954, "learning_rate": 0.00013510027526091872, "loss": 1.818338394165039, "step": 1350 }, { "epoch": 0.17, "grad_norm": 0.3575061559677124, "learning_rate": 0.00013497155279634617, "loss": 1.8177734375, "step": 1360 }, { "epoch": 0.17125, "grad_norm": 0.36351051926612854, "learning_rate": 0.00013484179743863064, "loss": 1.8408927917480469, "step": 1370 }, { "epoch": 0.1725, "grad_norm": 0.37017935514450073, "learning_rate": 0.0001347110112926905, "loss": 1.8088676452636718, "step": 1380 }, { "epoch": 0.17375, "grad_norm": 0.35998839139938354, "learning_rate": 0.00013457919648016573, "loss": 1.8451946258544922, "step": 1390 }, { "epoch": 0.175, "grad_norm": 0.36173009872436523, "learning_rate": 0.0001344463551393836, "loss": 1.7784915924072267, "step": 1400 }, { "epoch": 0.17625, "grad_norm": 0.3683062493801117, "learning_rate": 0.00013431248942532385, "loss": 1.745309829711914, "step": 1410 }, { "epoch": 0.1775, "grad_norm": 0.3488103151321411, "learning_rate": 0.00013417760150958392, "loss": 1.793316650390625, "step": 1420 }, { "epoch": 0.17875, "grad_norm": 0.35314610600471497, "learning_rate": 0.00013404169358034355, "loss": 1.7867753982543946, "step": 1430 }, { "epoch": 0.18, "grad_norm": 0.3577822744846344, "learning_rate": 0.0001339047678423294, "loss": 1.7581512451171875, "step": 1440 }, { "epoch": 0.18125, "grad_norm": 0.3387848436832428, "learning_rate": 0.00013376682651677918, "loss": 1.7947473526000977, "step": 1450 }, { "epoch": 0.1825, "grad_norm": 0.3571684658527374, "learning_rate": 0.00013362787184140572, "loss": 1.7496719360351562, "step": 1460 }, { "epoch": 0.18375, "grad_norm": 0.3472369313240051, "learning_rate": 0.0001334879060703606, "loss": 1.7750968933105469, "step": 1470 }, { "epoch": 0.185, "grad_norm": 0.3559383749961853, "learning_rate": 0.00013334693147419759, "loss": 1.8256034851074219, "step": 1480 }, { "epoch": 0.18625, "grad_norm": 0.35892486572265625, "learning_rate": 0.00013320495033983585, "loss": 1.7993803024291992, "step": 1490 }, { "epoch": 0.1875, "grad_norm": 0.3679066300392151, "learning_rate": 0.0001330619649705228, "loss": 1.8065261840820312, "step": 1500 }, { "epoch": 0.18875, "grad_norm": 0.36252209544181824, "learning_rate": 0.0001329179776857968, "loss": 1.8372112274169923, "step": 1510 }, { "epoch": 0.19, "grad_norm": 0.3526136577129364, "learning_rate": 0.0001327729908214494, "loss": 1.799185562133789, "step": 1520 }, { "epoch": 0.19125, "grad_norm": 0.3635775148868561, "learning_rate": 0.0001326270067294877, "loss": 1.8340118408203125, "step": 1530 }, { "epoch": 0.1925, "grad_norm": 0.36545416712760925, "learning_rate": 0.00013248002777809586, "loss": 1.7582477569580077, "step": 1540 }, { "epoch": 0.19375, "grad_norm": 0.37526363134384155, "learning_rate": 0.00013233205635159695, "loss": 1.799554443359375, "step": 1550 }, { "epoch": 0.195, "grad_norm": 0.35140055418014526, "learning_rate": 0.0001321830948504142, "loss": 1.84625244140625, "step": 1560 }, { "epoch": 0.19625, "grad_norm": 0.3566315770149231, "learning_rate": 0.0001320331456910319, "loss": 1.7883316040039063, "step": 1570 }, { "epoch": 0.1975, "grad_norm": 0.35099372267723083, "learning_rate": 0.0001318822113059565, "loss": 1.794087028503418, "step": 1580 }, { "epoch": 0.19875, "grad_norm": 0.35940778255462646, "learning_rate": 0.00013173029414367693, "loss": 1.7220880508422851, "step": 1590 }, { "epoch": 0.2, "grad_norm": 0.36045801639556885, "learning_rate": 0.0001315773966686249, "loss": 1.7802143096923828, "step": 1600 }, { "epoch": 0.20125, "grad_norm": 0.3581635057926178, "learning_rate": 0.000131423521361135, "loss": 1.799722671508789, "step": 1610 }, { "epoch": 0.2025, "grad_norm": 0.33708855509757996, "learning_rate": 0.00013126867071740436, "loss": 1.8053092956542969, "step": 1620 }, { "epoch": 0.20375, "grad_norm": 0.3750436007976532, "learning_rate": 0.00013111284724945228, "loss": 1.8074203491210938, "step": 1630 }, { "epoch": 0.205, "grad_norm": 0.35119321942329407, "learning_rate": 0.0001309560534850794, "loss": 1.8175487518310547, "step": 1640 }, { "epoch": 0.20625, "grad_norm": 0.3611745834350586, "learning_rate": 0.00013079829196782668, "loss": 1.7702863693237305, "step": 1650 }, { "epoch": 0.2075, "grad_norm": 0.3799806833267212, "learning_rate": 0.00013063956525693424, "loss": 1.8235919952392579, "step": 1660 }, { "epoch": 0.20875, "grad_norm": 0.33240807056427, "learning_rate": 0.0001304798759272997, "loss": 1.768626594543457, "step": 1670 }, { "epoch": 0.21, "grad_norm": 0.36028313636779785, "learning_rate": 0.00013031922656943647, "loss": 1.829296875, "step": 1680 }, { "epoch": 0.21125, "grad_norm": 0.34874534606933594, "learning_rate": 0.00013015761978943185, "loss": 1.8018821716308593, "step": 1690 }, { "epoch": 0.2125, "grad_norm": 0.34944280982017517, "learning_rate": 0.00012999505820890448, "loss": 1.8226497650146485, "step": 1700 }, { "epoch": 0.21375, "grad_norm": 0.35128575563430786, "learning_rate": 0.00012983154446496209, "loss": 1.7741992950439454, "step": 1710 }, { "epoch": 0.215, "grad_norm": 0.3564985692501068, "learning_rate": 0.0001296670812101586, "loss": 1.7850433349609376, "step": 1720 }, { "epoch": 0.21625, "grad_norm": 0.3676067292690277, "learning_rate": 0.000129501671112451, "loss": 1.8290214538574219, "step": 1730 }, { "epoch": 0.2175, "grad_norm": 0.3726136386394501, "learning_rate": 0.00012933531685515627, "loss": 1.7774532318115235, "step": 1740 }, { "epoch": 0.21875, "grad_norm": 0.3493287265300751, "learning_rate": 0.00012916802113690766, "loss": 1.7807361602783203, "step": 1750 }, { "epoch": 0.22, "grad_norm": 0.37059202790260315, "learning_rate": 0.00012899978667161105, "loss": 1.749721145629883, "step": 1760 }, { "epoch": 0.22125, "grad_norm": 0.356022447347641, "learning_rate": 0.00012883061618840087, "loss": 1.8218292236328124, "step": 1770 }, { "epoch": 0.2225, "grad_norm": 0.3568074405193329, "learning_rate": 0.00012866051243159572, "loss": 1.8072574615478516, "step": 1780 }, { "epoch": 0.22375, "grad_norm": 0.3749092221260071, "learning_rate": 0.00012848947816065416, "loss": 1.8410078048706056, "step": 1790 }, { "epoch": 0.225, "grad_norm": 0.35633665323257446, "learning_rate": 0.00012831751615012955, "loss": 1.7817327499389648, "step": 1800 }, { "epoch": 0.22625, "grad_norm": 0.3607875108718872, "learning_rate": 0.00012814462918962533, "loss": 1.8118452072143554, "step": 1810 }, { "epoch": 0.2275, "grad_norm": 0.34315699338912964, "learning_rate": 0.00012797082008374967, "loss": 1.8008819580078126, "step": 1820 }, { "epoch": 0.22875, "grad_norm": 0.358188658952713, "learning_rate": 0.00012779609165206992, "loss": 1.8048545837402343, "step": 1830 }, { "epoch": 0.23, "grad_norm": 0.3641424775123596, "learning_rate": 0.000127620446729067, "loss": 1.8129388809204101, "step": 1840 }, { "epoch": 0.23125, "grad_norm": 0.36388713121414185, "learning_rate": 0.00012744388816408926, "loss": 1.7981510162353516, "step": 1850 }, { "epoch": 0.2325, "grad_norm": 0.3411344587802887, "learning_rate": 0.00012726641882130642, "loss": 1.7846858978271485, "step": 1860 }, { "epoch": 0.23375, "grad_norm": 0.36635443568229675, "learning_rate": 0.00012708804157966297, "loss": 1.8334461212158204, "step": 1870 }, { "epoch": 0.235, "grad_norm": 0.3459226191043854, "learning_rate": 0.00012690875933283154, "loss": 1.7850067138671875, "step": 1880 }, { "epoch": 0.23625, "grad_norm": 0.3630014657974243, "learning_rate": 0.00012672857498916595, "loss": 1.8400045394897462, "step": 1890 }, { "epoch": 0.2375, "grad_norm": 0.3783304691314697, "learning_rate": 0.000126547491471654, "loss": 1.7719623565673828, "step": 1900 }, { "epoch": 0.23875, "grad_norm": 0.3790845572948456, "learning_rate": 0.0001263655117178701, "loss": 1.8144996643066407, "step": 1910 }, { "epoch": 0.24, "grad_norm": 0.35528555512428284, "learning_rate": 0.0001261826386799276, "loss": 1.797579002380371, "step": 1920 }, { "epoch": 0.24125, "grad_norm": 0.3462880253791809, "learning_rate": 0.00012599887532443088, "loss": 1.7669387817382813, "step": 1930 }, { "epoch": 0.2425, "grad_norm": 0.35499900579452515, "learning_rate": 0.00012581422463242716, "loss": 1.782514762878418, "step": 1940 }, { "epoch": 0.24375, "grad_norm": 0.35548484325408936, "learning_rate": 0.00012562868959935835, "loss": 1.7927711486816407, "step": 1950 }, { "epoch": 0.245, "grad_norm": 0.36208584904670715, "learning_rate": 0.00012544227323501222, "loss": 1.8539527893066405, "step": 1960 }, { "epoch": 0.24625, "grad_norm": 0.3629232347011566, "learning_rate": 0.0001252549785634738, "loss": 1.7535400390625, "step": 1970 }, { "epoch": 0.2475, "grad_norm": 0.33926820755004883, "learning_rate": 0.000125066808623076, "loss": 1.7788131713867188, "step": 1980 }, { "epoch": 0.24875, "grad_norm": 0.3651394546031952, "learning_rate": 0.00012487776646635072, "loss": 1.8248186111450195, "step": 1990 }, { "epoch": 0.25, "grad_norm": 0.35856956243515015, "learning_rate": 0.00012468785515997905, "loss": 1.7728294372558593, "step": 2000 }, { "epoch": 0.25125, "grad_norm": 0.36707815527915955, "learning_rate": 0.0001244970777847416, "loss": 1.797306442260742, "step": 2010 }, { "epoch": 0.2525, "grad_norm": 0.37768349051475525, "learning_rate": 0.00012430543743546853, "loss": 1.8138954162597656, "step": 2020 }, { "epoch": 0.25375, "grad_norm": 0.3719421625137329, "learning_rate": 0.00012411293722098938, "loss": 1.8046173095703124, "step": 2030 }, { "epoch": 0.255, "grad_norm": 0.35382720828056335, "learning_rate": 0.00012391958026408258, "loss": 1.765408706665039, "step": 2040 }, { "epoch": 0.25625, "grad_norm": 0.3717374801635742, "learning_rate": 0.00012372536970142481, "loss": 1.794291877746582, "step": 2050 }, { "epoch": 0.2575, "grad_norm": 0.37810182571411133, "learning_rate": 0.0001235303086835401, "loss": 1.7855905532836913, "step": 2060 }, { "epoch": 0.25875, "grad_norm": 0.34465938806533813, "learning_rate": 0.00012333440037474877, "loss": 1.7502609252929688, "step": 2070 }, { "epoch": 0.26, "grad_norm": 0.3537978529930115, "learning_rate": 0.0001231376479531161, "loss": 1.8433588027954102, "step": 2080 }, { "epoch": 0.26125, "grad_norm": 0.3481179475784302, "learning_rate": 0.00012294005461040066, "loss": 1.778417205810547, "step": 2090 }, { "epoch": 0.2625, "grad_norm": 0.36712074279785156, "learning_rate": 0.00012274162355200264, "loss": 1.8297000885009767, "step": 2100 }, { "epoch": 0.26375, "grad_norm": 0.36218199133872986, "learning_rate": 0.0001225423579969119, "loss": 1.8048271179199218, "step": 2110 }, { "epoch": 0.265, "grad_norm": 0.3427264988422394, "learning_rate": 0.00012234226117765565, "loss": 1.765831756591797, "step": 2120 }, { "epoch": 0.26625, "grad_norm": 0.35128286480903625, "learning_rate": 0.00012214133634024592, "loss": 1.8477115631103516, "step": 2130 }, { "epoch": 0.2675, "grad_norm": 0.36919906735420227, "learning_rate": 0.0001219395867441272, "loss": 1.7384143829345704, "step": 2140 }, { "epoch": 0.26875, "grad_norm": 0.37480294704437256, "learning_rate": 0.00012173701566212328, "loss": 1.776589584350586, "step": 2150 }, { "epoch": 0.27, "grad_norm": 0.3442743718624115, "learning_rate": 0.00012153362638038429, "loss": 1.7534845352172852, "step": 2160 }, { "epoch": 0.27125, "grad_norm": 0.3617842495441437, "learning_rate": 0.0001213294221983334, "loss": 1.8287986755371093, "step": 2170 }, { "epoch": 0.2725, "grad_norm": 0.3468424081802368, "learning_rate": 0.00012112440642861319, "loss": 1.7810518264770507, "step": 2180 }, { "epoch": 0.27375, "grad_norm": 0.36655351519584656, "learning_rate": 0.000120918582397032, "loss": 1.8189208984375, "step": 2190 }, { "epoch": 0.275, "grad_norm": 0.35723134875297546, "learning_rate": 0.00012071195344251006, "loss": 1.8201839447021484, "step": 2200 }, { "epoch": 0.27625, "grad_norm": 0.36652442812919617, "learning_rate": 0.00012050452291702508, "loss": 1.8076786041259765, "step": 2210 }, { "epoch": 0.2775, "grad_norm": 0.3568657338619232, "learning_rate": 0.00012029629418555812, "loss": 1.7748506546020508, "step": 2220 }, { "epoch": 0.27875, "grad_norm": 0.34934675693511963, "learning_rate": 0.00012008727062603888, "loss": 1.8173185348510743, "step": 2230 }, { "epoch": 0.28, "grad_norm": 0.34384509921073914, "learning_rate": 0.00011987745562929093, "loss": 1.7502407073974608, "step": 2240 }, { "epoch": 0.28125, "grad_norm": 0.3680790066719055, "learning_rate": 0.00011966685259897665, "loss": 1.741659927368164, "step": 2250 }, { "epoch": 0.2825, "grad_norm": 0.37108564376831055, "learning_rate": 0.00011945546495154214, "loss": 1.7894527435302734, "step": 2260 }, { "epoch": 0.28375, "grad_norm": 0.37491941452026367, "learning_rate": 0.00011924329611616168, "loss": 1.7868507385253907, "step": 2270 }, { "epoch": 0.285, "grad_norm": 0.3443116545677185, "learning_rate": 0.00011903034953468213, "loss": 1.7541233062744142, "step": 2280 }, { "epoch": 0.28625, "grad_norm": 0.3643540143966675, "learning_rate": 0.00011881662866156715, "loss": 1.8128959655761718, "step": 2290 }, { "epoch": 0.2875, "grad_norm": 0.35639819502830505, "learning_rate": 0.00011860213696384107, "loss": 1.7657649993896485, "step": 2300 }, { "epoch": 0.28875, "grad_norm": 0.36442187428474426, "learning_rate": 0.00011838687792103273, "loss": 1.792444610595703, "step": 2310 }, { "epoch": 0.29, "grad_norm": 0.36035555601119995, "learning_rate": 0.00011817085502511903, "loss": 1.7670486450195313, "step": 2320 }, { "epoch": 0.29125, "grad_norm": 0.3552349805831909, "learning_rate": 0.00011795407178046817, "loss": 1.8542526245117188, "step": 2330 }, { "epoch": 0.2925, "grad_norm": 0.3693036437034607, "learning_rate": 0.00011773653170378296, "loss": 1.6886547088623047, "step": 2340 }, { "epoch": 0.29375, "grad_norm": 0.3605458736419678, "learning_rate": 0.00011751823832404365, "loss": 1.7754722595214845, "step": 2350 }, { "epoch": 0.295, "grad_norm": 0.35839903354644775, "learning_rate": 0.00011729919518245076, "loss": 1.7882440567016602, "step": 2360 }, { "epoch": 0.29625, "grad_norm": 0.36839786171913147, "learning_rate": 0.00011707940583236761, "loss": 1.7781326293945312, "step": 2370 }, { "epoch": 0.2975, "grad_norm": 0.35868513584136963, "learning_rate": 0.0001168588738392626, "loss": 1.7871665954589844, "step": 2380 }, { "epoch": 0.29875, "grad_norm": 0.3435186743736267, "learning_rate": 0.00011663760278065153, "loss": 1.8193252563476563, "step": 2390 }, { "epoch": 0.3, "grad_norm": 0.3949030935764313, "learning_rate": 0.00011641559624603941, "loss": 1.7928247451782227, "step": 2400 }, { "epoch": 0.30125, "grad_norm": 0.3681996762752533, "learning_rate": 0.00011619285783686234, "loss": 1.7616628646850585, "step": 2410 }, { "epoch": 0.3025, "grad_norm": 0.3694431781768799, "learning_rate": 0.00011596939116642899, "loss": 1.8024406433105469, "step": 2420 }, { "epoch": 0.30375, "grad_norm": 0.3637784719467163, "learning_rate": 0.00011574519985986208, "loss": 1.757676887512207, "step": 2430 }, { "epoch": 0.305, "grad_norm": 0.3616812229156494, "learning_rate": 0.00011552028755403952, "loss": 1.79559326171875, "step": 2440 }, { "epoch": 0.30625, "grad_norm": 0.36502957344055176, "learning_rate": 0.00011529465789753538, "loss": 1.7899351119995117, "step": 2450 }, { "epoch": 0.3075, "grad_norm": 0.3788166344165802, "learning_rate": 0.00011506831455056079, "loss": 1.8282848358154298, "step": 2460 }, { "epoch": 0.30875, "grad_norm": 0.36333489418029785, "learning_rate": 0.00011484126118490451, "loss": 1.766189956665039, "step": 2470 }, { "epoch": 0.31, "grad_norm": 0.35034865140914917, "learning_rate": 0.00011461350148387332, "loss": 1.7669204711914062, "step": 2480 }, { "epoch": 0.31125, "grad_norm": 0.35153037309646606, "learning_rate": 0.00011438503914223241, "loss": 1.7271625518798828, "step": 2490 }, { "epoch": 0.3125, "grad_norm": 0.3732260763645172, "learning_rate": 0.00011415587786614524, "loss": 1.7690876007080079, "step": 2500 }, { "epoch": 0.31375, "grad_norm": 0.3613711893558502, "learning_rate": 0.0001139260213731136, "loss": 1.7684833526611328, "step": 2510 }, { "epoch": 0.315, "grad_norm": 0.35713133215904236, "learning_rate": 0.00011369547339191726, "loss": 1.7643346786499023, "step": 2520 }, { "epoch": 0.31625, "grad_norm": 0.35974639654159546, "learning_rate": 0.0001134642376625534, "loss": 1.7887260437011718, "step": 2530 }, { "epoch": 0.3175, "grad_norm": 0.36356088519096375, "learning_rate": 0.00011323231793617599, "loss": 1.788846206665039, "step": 2540 }, { "epoch": 0.31875, "grad_norm": 0.3578101098537445, "learning_rate": 0.00011299971797503495, "loss": 1.781305694580078, "step": 2550 }, { "epoch": 0.32, "grad_norm": 0.35546955466270447, "learning_rate": 0.00011276644155241517, "loss": 1.7678417205810546, "step": 2560 }, { "epoch": 0.32125, "grad_norm": 0.3539295792579651, "learning_rate": 0.00011253249245257516, "loss": 1.7507053375244142, "step": 2570 }, { "epoch": 0.3225, "grad_norm": 0.35056355595588684, "learning_rate": 0.00011229787447068576, "loss": 1.8345399856567384, "step": 2580 }, { "epoch": 0.32375, "grad_norm": 0.3503001034259796, "learning_rate": 0.00011206259141276858, "loss": 1.8280166625976562, "step": 2590 }, { "epoch": 0.325, "grad_norm": 0.3602514863014221, "learning_rate": 0.0001118266470956342, "loss": 1.7046276092529298, "step": 2600 }, { "epoch": 0.32625, "grad_norm": 0.3672384023666382, "learning_rate": 0.00011159004534682027, "loss": 1.805099868774414, "step": 2610 }, { "epoch": 0.3275, "grad_norm": 0.3589872419834137, "learning_rate": 0.00011135279000452953, "loss": 1.7550365447998046, "step": 2620 }, { "epoch": 0.32875, "grad_norm": 0.3497745990753174, "learning_rate": 0.00011111488491756732, "loss": 1.758819580078125, "step": 2630 }, { "epoch": 0.33, "grad_norm": 0.3647236227989197, "learning_rate": 0.00011087633394527935, "loss": 1.765294647216797, "step": 2640 }, { "epoch": 0.33125, "grad_norm": 0.33403027057647705, "learning_rate": 0.00011063714095748899, "loss": 1.7979480743408203, "step": 2650 }, { "epoch": 0.3325, "grad_norm": 0.3792349696159363, "learning_rate": 0.00011039730983443455, "loss": 1.829258346557617, "step": 2660 }, { "epoch": 0.33375, "grad_norm": 0.3754643201828003, "learning_rate": 0.00011015684446670626, "loss": 1.783727264404297, "step": 2670 }, { "epoch": 0.335, "grad_norm": 0.3466981053352356, "learning_rate": 0.00010991574875518323, "loss": 1.7687664031982422, "step": 2680 }, { "epoch": 0.33625, "grad_norm": 0.3535688519477844, "learning_rate": 0.00010967402661097012, "loss": 1.8189085006713868, "step": 2690 }, { "epoch": 0.3375, "grad_norm": 0.36101067066192627, "learning_rate": 0.0001094316819553337, "loss": 1.752197265625, "step": 2700 }, { "epoch": 0.33875, "grad_norm": 0.36568474769592285, "learning_rate": 0.0001091887187196393, "loss": 1.7754268646240234, "step": 2710 }, { "epoch": 0.34, "grad_norm": 0.3312813639640808, "learning_rate": 0.00010894514084528695, "loss": 1.75748291015625, "step": 2720 }, { "epoch": 0.34125, "grad_norm": 0.3573434054851532, "learning_rate": 0.00010870095228364743, "loss": 1.7631900787353516, "step": 2730 }, { "epoch": 0.3425, "grad_norm": 0.35645684599876404, "learning_rate": 0.00010845615699599832, "loss": 1.747064971923828, "step": 2740 }, { "epoch": 0.34375, "grad_norm": 0.3608238101005554, "learning_rate": 0.00010821075895345951, "loss": 1.772369384765625, "step": 2750 }, { "epoch": 0.345, "grad_norm": 0.37147653102874756, "learning_rate": 0.00010796476213692903, "loss": 1.8682558059692382, "step": 2760 }, { "epoch": 0.34625, "grad_norm": 0.3562459349632263, "learning_rate": 0.0001077181705370183, "loss": 1.7756576538085938, "step": 2770 }, { "epoch": 0.3475, "grad_norm": 0.3861102759838104, "learning_rate": 0.00010747098815398739, "loss": 1.797110366821289, "step": 2780 }, { "epoch": 0.34875, "grad_norm": 0.3438943326473236, "learning_rate": 0.0001072232189976802, "loss": 1.7463438034057617, "step": 2790 }, { "epoch": 0.35, "grad_norm": 0.3862653374671936, "learning_rate": 0.00010697486708745942, "loss": 1.781214141845703, "step": 2800 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.750651595063296e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }