{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992193598750976,
  "eval_steps": 500,
  "global_step": 640,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00156128024980484,
      "grad_norm": 157.15646152338417,
      "learning_rate": 3.125e-06,
      "loss": 1.2885,
      "step": 1
    },
    {
      "epoch": 0.0078064012490242,
      "grad_norm": 853.4262826240488,
      "learning_rate": 1.5625e-05,
      "loss": 1.6014,
      "step": 5
    },
    {
      "epoch": 0.0156128024980484,
      "grad_norm": 176.7851609681746,
      "learning_rate": 3.125e-05,
      "loss": 1.3238,
      "step": 10
    },
    {
      "epoch": 0.0234192037470726,
      "grad_norm": 72.1158095953751,
      "learning_rate": 4.6875e-05,
      "loss": 1.3881,
      "step": 15
    },
    {
      "epoch": 0.0312256049960968,
      "grad_norm": 110.29629232077926,
      "learning_rate": 6.25e-05,
      "loss": 2.0626,
      "step": 20
    },
    {
      "epoch": 0.039032006245121,
      "grad_norm": 329.0378004452346,
      "learning_rate": 7.8125e-05,
      "loss": 1.5588,
      "step": 25
    },
    {
      "epoch": 0.0468384074941452,
      "grad_norm": 57.761989240852614,
      "learning_rate": 9.375e-05,
      "loss": 2.346,
      "step": 30
    },
    {
      "epoch": 0.0546448087431694,
      "grad_norm": 32.91341606027756,
      "learning_rate": 0.000109375,
      "loss": 1.4373,
      "step": 35
    },
    {
      "epoch": 0.0624512099921936,
      "grad_norm": 193.08987603577697,
      "learning_rate": 0.000125,
      "loss": 1.8054,
      "step": 40
    },
    {
      "epoch": 0.0702576112412178,
      "grad_norm": 692.8558598515177,
      "learning_rate": 0.00014062500000000002,
      "loss": 2.811,
      "step": 45
    },
    {
      "epoch": 0.078064012490242,
      "grad_norm": 823.3938270920198,
      "learning_rate": 0.00015625,
      "loss": 3.0194,
      "step": 50
    },
    {
      "epoch": 0.0858704137392662,
      "grad_norm": 86.72205086386609,
      "learning_rate": 0.00017187500000000002,
      "loss": 7.2331,
      "step": 55
    },
    {
      "epoch": 0.0936768149882904,
      "grad_norm": 2858.9230612729734,
      "learning_rate": 0.0001875,
      "loss": 9.3111,
      "step": 60
    },
    {
      "epoch": 0.1014832162373146,
      "grad_norm": 42.021069127858276,
      "learning_rate": 0.00019999851261394218,
      "loss": 8.0654,
      "step": 65
    },
    {
      "epoch": 0.1092896174863388,
      "grad_norm": 466.6971436418362,
      "learning_rate": 0.00019994645874763658,
      "loss": 34.4646,
      "step": 70
    },
    {
      "epoch": 0.117096018735363,
      "grad_norm": 60.785831180901994,
      "learning_rate": 0.00019982007981886847,
      "loss": 14.4744,
      "step": 75
    },
    {
      "epoch": 0.1249024199843872,
      "grad_norm": 64.3133081084402,
      "learning_rate": 0.00019961946980917456,
      "loss": 15.3848,
      "step": 80
    },
    {
      "epoch": 0.1327088212334114,
      "grad_norm": 62.372835015380694,
      "learning_rate": 0.00019934477790194445,
      "loss": 15.4141,
      "step": 85
    },
    {
      "epoch": 0.1405152224824356,
      "grad_norm": 43.926768906864126,
      "learning_rate": 0.00019899620837148077,
      "loss": 11.7617,
      "step": 90
    },
    {
      "epoch": 0.1483216237314598,
      "grad_norm": 40.79962268028991,
      "learning_rate": 0.0001985740204310909,
      "loss": 9.396,
      "step": 95
    },
    {
      "epoch": 0.156128024980484,
      "grad_norm": 30.358089519115435,
      "learning_rate": 0.00019807852804032305,
      "loss": 8.001,
      "step": 100
    },
    {
      "epoch": 0.16393442622950818,
      "grad_norm": 29.528505324475315,
      "learning_rate": 0.00019751009967149087,
      "loss": 7.7649,
      "step": 105
    },
    {
      "epoch": 0.1717408274785324,
      "grad_norm": 34.508708528572726,
      "learning_rate": 0.00019686915803565934,
      "loss": 7.2406,
      "step": 110
    },
    {
      "epoch": 0.1795472287275566,
      "grad_norm": 27.549362726844333,
      "learning_rate": 0.0001961561797682962,
      "loss": 6.9614,
      "step": 115
    },
    {
      "epoch": 0.1873536299765808,
      "grad_norm": 18.670623052254278,
      "learning_rate": 0.0001953716950748227,
      "loss": 6.7584,
      "step": 120
    },
    {
      "epoch": 0.195160031225605,
      "grad_norm": 15.678010714271142,
      "learning_rate": 0.0001945162873363268,
      "loss": 6.6136,
      "step": 125
    },
    {
      "epoch": 0.2029664324746292,
      "grad_norm": 25.38891212752094,
      "learning_rate": 0.0001935905926757326,
      "loss": 6.5037,
      "step": 130
    },
    {
      "epoch": 0.2107728337236534,
      "grad_norm": 17.104405780834306,
      "learning_rate": 0.00019259529948474833,
      "loss": 6.4509,
      "step": 135
    },
    {
      "epoch": 0.2185792349726776,
      "grad_norm": 18.211804642526683,
      "learning_rate": 0.00019153114791194473,
      "loss": 6.4213,
      "step": 140
    },
    {
      "epoch": 0.2263856362217018,
      "grad_norm": 13.43509168137259,
      "learning_rate": 0.00019039892931234435,
      "loss": 6.4289,
      "step": 145
    },
    {
      "epoch": 0.234192037470726,
      "grad_norm": 24.30609404768323,
      "learning_rate": 0.00018919948565893142,
      "loss": 6.543,
      "step": 150
    },
    {
      "epoch": 0.2419984387197502,
      "grad_norm": 13.24603379582485,
      "learning_rate": 0.00018793370891651972,
      "loss": 6.4651,
      "step": 155
    },
    {
      "epoch": 0.2498048399687744,
      "grad_norm": 13.84571961153875,
      "learning_rate": 0.00018660254037844388,
      "loss": 6.5636,
      "step": 160
    },
    {
      "epoch": 0.2576112412177986,
      "grad_norm": 18.892394839886915,
      "learning_rate": 0.00018520696996656788,
      "loss": 6.3237,
      "step": 165
    },
    {
      "epoch": 0.2654176424668228,
      "grad_norm": 16.539574669022365,
      "learning_rate": 0.0001837480354951308,
      "loss": 6.289,
      "step": 170
    },
    {
      "epoch": 0.273224043715847,
      "grad_norm": 14.553617681599023,
      "learning_rate": 0.00018222682189897752,
      "loss": 6.2864,
      "step": 175
    },
    {
      "epoch": 0.2810304449648712,
      "grad_norm": 9.20342696970865,
      "learning_rate": 0.00018064446042674828,
      "loss": 6.2649,
      "step": 180
    },
    {
      "epoch": 0.2888368462138954,
      "grad_norm": 8.702607819247694,
      "learning_rate": 0.0001790021277996269,
      "loss": 6.2786,
      "step": 185
    },
    {
      "epoch": 0.2966432474629196,
      "grad_norm": 13.099160207983402,
      "learning_rate": 0.0001773010453362737,
      "loss": 6.371,
      "step": 190
    },
    {
      "epoch": 0.3044496487119438,
      "grad_norm": 8.619612463148753,
      "learning_rate": 0.00017554247804459316,
      "loss": 6.2621,
      "step": 195
    },
    {
      "epoch": 0.312256049960968,
      "grad_norm": 9.173309873502212,
      "learning_rate": 0.0001737277336810124,
      "loss": 6.1984,
      "step": 200
    },
    {
      "epoch": 0.3200624512099922,
      "grad_norm": 6.449786485091637,
      "learning_rate": 0.0001718581617779698,
      "loss": 6.2149,
      "step": 205
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 7.729524110198087,
      "learning_rate": 0.00016993515264033672,
      "loss": 6.242,
      "step": 210
    },
    {
      "epoch": 0.3356752537080406,
      "grad_norm": 7.284647734979868,
      "learning_rate": 0.00016796013631151897,
      "loss": 6.2354,
      "step": 215
    },
    {
      "epoch": 0.3434816549570648,
      "grad_norm": 9.118520382715053,
      "learning_rate": 0.00016593458151000688,
      "loss": 6.3628,
      "step": 220
    },
    {
      "epoch": 0.351288056206089,
      "grad_norm": 9.589710899262458,
      "learning_rate": 0.00016385999453716454,
      "loss": 6.2361,
      "step": 225
    },
    {
      "epoch": 0.3590944574551132,
      "grad_norm": 6.437393903376951,
      "learning_rate": 0.00016173791815707051,
      "loss": 6.1567,
      "step": 230
    },
    {
      "epoch": 0.3669008587041374,
      "grad_norm": 5.453863541032297,
      "learning_rate": 0.00015956993044924334,
      "loss": 6.1254,
      "step": 235
    },
    {
      "epoch": 0.3747072599531616,
      "grad_norm": 6.169112414603225,
      "learning_rate": 0.0001573576436351046,
      "loss": 6.1168,
      "step": 240
    },
    {
      "epoch": 0.3825136612021858,
      "grad_norm": 539.7477490832886,
      "learning_rate": 0.0001551027028790524,
      "loss": 6.6357,
      "step": 245
    },
    {
      "epoch": 0.39032006245121,
      "grad_norm": 8.2908726316984,
      "learning_rate": 0.0001528067850650368,
      "loss": 6.1525,
      "step": 250
    },
    {
      "epoch": 0.3981264637002342,
      "grad_norm": 23.234788125019623,
      "learning_rate": 0.0001504715975495472,
      "loss": 6.1363,
      "step": 255
    },
    {
      "epoch": 0.4059328649492584,
      "grad_norm": 15.562031107869837,
      "learning_rate": 0.00014809887689193877,
      "loss": 5.9872,
      "step": 260
    },
    {
      "epoch": 0.4137392661982826,
      "grad_norm": 8.993011846031898,
      "learning_rate": 0.00014569038756304207,
      "loss": 6.0967,
      "step": 265
    },
    {
      "epoch": 0.4215456674473068,
      "grad_norm": 7.023501611334114,
      "learning_rate": 0.00014324792063301662,
      "loss": 5.8908,
      "step": 270
    },
    {
      "epoch": 0.42935206869633097,
      "grad_norm": 9.001419937551082,
      "learning_rate": 0.00014077329243942369,
      "loss": 5.8014,
      "step": 275
    },
    {
      "epoch": 0.4371584699453552,
      "grad_norm": 4.987151065225559,
      "learning_rate": 0.000138268343236509,
      "loss": 5.6783,
      "step": 280
    },
    {
      "epoch": 0.4449648711943794,
      "grad_norm": 10.7381170330122,
      "learning_rate": 0.00013573493582670003,
      "loss": 5.7037,
      "step": 285
    },
    {
      "epoch": 0.4527712724434036,
      "grad_norm": 8.164842583353764,
      "learning_rate": 0.00013317495417533524,
      "loss": 5.559,
      "step": 290
    },
    {
      "epoch": 0.4605776736924278,
      "grad_norm": 9.135687146937308,
      "learning_rate": 0.00013059030200965536,
      "loss": 5.5549,
      "step": 295
    },
    {
      "epoch": 0.468384074941452,
      "grad_norm": 13.341060822205531,
      "learning_rate": 0.00012798290140309923,
      "loss": 5.4623,
      "step": 300
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 14.745374236234094,
      "learning_rate": 0.00012535469134595595,
      "loss": 5.4395,
      "step": 305
    },
    {
      "epoch": 0.4839968774395004,
      "grad_norm": 5.92938394119908,
      "learning_rate": 0.00012270762630343734,
      "loss": 5.3539,
      "step": 310
    },
    {
      "epoch": 0.4918032786885246,
      "grad_norm": 7.685908647839541,
      "learning_rate": 0.00012004367476224206,
      "loss": 5.3268,
      "step": 315
    },
    {
      "epoch": 0.4996096799375488,
      "grad_norm": 5.2865860628796675,
      "learning_rate": 0.00011736481776669306,
      "loss": 5.265,
      "step": 320
    },
    {
      "epoch": 0.507416081186573,
      "grad_norm": 6.173590068189982,
      "learning_rate": 0.00011467304744553618,
      "loss": 5.216,
      "step": 325
    },
    {
      "epoch": 0.5152224824355972,
      "grad_norm": 6.816268113111804,
      "learning_rate": 0.00011197036553049625,
      "loss": 5.1764,
      "step": 330
    },
    {
      "epoch": 0.5230288836846214,
      "grad_norm": 7.422948718792107,
      "learning_rate": 0.00010925878186769158,
      "loss": 5.0837,
      "step": 335
    },
    {
      "epoch": 0.5308352849336456,
      "grad_norm": 8.567751763291117,
      "learning_rate": 0.00010654031292301432,
      "loss": 4.9996,
      "step": 340
    },
    {
      "epoch": 0.5386416861826698,
      "grad_norm": 10.80004035132535,
      "learning_rate": 0.00010381698028258817,
      "loss": 4.957,
      "step": 345
    },
    {
      "epoch": 0.546448087431694,
      "grad_norm": 26.492130474508215,
      "learning_rate": 0.00010109080914941824,
      "loss": 4.9046,
      "step": 350
    },
    {
      "epoch": 0.5542544886807181,
      "grad_norm": 25.934427911795183,
      "learning_rate": 9.836382683735132e-05,
      "loss": 4.8369,
      "step": 355
    },
    {
      "epoch": 0.5620608899297423,
      "grad_norm": 16.57403138193505,
      "learning_rate": 9.563806126346642e-05,
      "loss": 4.8889,
      "step": 360
    },
    {
      "epoch": 0.5698672911787666,
      "grad_norm": 7.069388142727566,
      "learning_rate": 9.29155394400166e-05,
      "loss": 4.74,
      "step": 365
    },
    {
      "epoch": 0.5776736924277908,
      "grad_norm": 5.274766418061986,
      "learning_rate": 9.019828596704394e-05,
      "loss": 4.5576,
      "step": 370
    },
    {
      "epoch": 0.585480093676815,
      "grad_norm": 5.931115883955146,
      "learning_rate": 8.74883215267881e-05,
      "loss": 4.4682,
      "step": 375
    },
    {
      "epoch": 0.5932864949258392,
      "grad_norm": 5.310312707279328,
      "learning_rate": 8.478766138100834e-05,
      "loss": 4.442,
      "step": 380
    },
    {
      "epoch": 0.6010928961748634,
      "grad_norm": 5.30406906823985,
      "learning_rate": 8.209831387233676e-05,
      "loss": 4.4152,
      "step": 385
    },
    {
      "epoch": 0.6088992974238876,
      "grad_norm": 9.462865189747752,
      "learning_rate": 7.942227893077652e-05,
      "loss": 4.3487,
      "step": 390
    },
    {
      "epoch": 0.6167056986729118,
      "grad_norm": 5.991992327255806,
      "learning_rate": 7.676154658645656e-05,
      "loss": 4.2129,
      "step": 395
    },
    {
      "epoch": 0.624512099921936,
      "grad_norm": 5.789080303907565,
      "learning_rate": 7.411809548974792e-05,
      "loss": 4.1259,
      "step": 400
    },
    {
      "epoch": 0.6323185011709602,
      "grad_norm": 4.259589401030486,
      "learning_rate": 7.149389143984295e-05,
      "loss": 4.1542,
      "step": 405
    },
    {
      "epoch": 0.6401249024199844,
      "grad_norm": 7.6466711994087495,
      "learning_rate": 6.889088592289093e-05,
      "loss": 4.0767,
      "step": 410
    },
    {
      "epoch": 0.6479313036690086,
      "grad_norm": 5.788736636880761,
      "learning_rate": 6.6311014660778e-05,
      "loss": 4.0619,
      "step": 415
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 8.998684058136783,
      "learning_rate": 6.375619617162985e-05,
      "loss": 4.0185,
      "step": 420
    },
    {
      "epoch": 0.663544106167057,
      "grad_norm": 11.002645236663817,
      "learning_rate": 6.122833034310793e-05,
      "loss": 3.9383,
      "step": 425
    },
    {
      "epoch": 0.6713505074160812,
      "grad_norm": 8.016050927093819,
      "learning_rate": 5.872929701956054e-05,
      "loss": 3.8441,
      "step": 430
    },
    {
      "epoch": 0.6791569086651054,
      "grad_norm": 10.175896641084389,
      "learning_rate": 5.6260954604078585e-05,
      "loss": 3.8583,
      "step": 435
    },
    {
      "epoch": 0.6869633099141296,
      "grad_norm": 11.690630430844724,
      "learning_rate": 5.382513867649663e-05,
      "loss": 3.8034,
      "step": 440
    },
    {
      "epoch": 0.6947697111631538,
      "grad_norm": 7.352060919259499,
      "learning_rate": 5.142366062836599e-05,
      "loss": 3.7862,
      "step": 445
    },
    {
      "epoch": 0.702576112412178,
      "grad_norm": 6.876419598591147,
      "learning_rate": 4.9058306315915826e-05,
      "loss": 3.6953,
      "step": 450
    },
    {
      "epoch": 0.7103825136612022,
      "grad_norm": 4.487889989074099,
      "learning_rate": 4.6730834732003104e-05,
      "loss": 3.7287,
      "step": 455
    },
    {
      "epoch": 0.7181889149102264,
      "grad_norm": 6.939469433182013,
      "learning_rate": 4.444297669803981e-05,
      "loss": 3.6574,
      "step": 460
    },
    {
      "epoch": 0.7259953161592506,
      "grad_norm": 7.054811740640252,
      "learning_rate": 4.219643357686967e-05,
      "loss": 3.5359,
      "step": 465
    },
    {
      "epoch": 0.7338017174082748,
      "grad_norm": 8.924310882869229,
      "learning_rate": 3.999287600755192e-05,
      "loss": 3.5094,
      "step": 470
    },
    {
      "epoch": 0.741608118657299,
      "grad_norm": 8.710035323966535,
      "learning_rate": 3.783394266299228e-05,
      "loss": 3.5068,
      "step": 475
    },
    {
      "epoch": 0.7494145199063232,
      "grad_norm": 6.905209805801851,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 3.4915,
      "step": 480
    },
    {
      "epoch": 0.7572209211553473,
      "grad_norm": 5.9741716032348995,
      "learning_rate": 3.365633622209891e-05,
      "loss": 3.483,
      "step": 485
    },
    {
      "epoch": 0.7650273224043715,
      "grad_norm": 5.945682725760953,
      "learning_rate": 3.164076979771287e-05,
      "loss": 3.3759,
      "step": 490
    },
    {
      "epoch": 0.7728337236533958,
      "grad_norm": 6.085256420633571,
      "learning_rate": 2.9676038631707593e-05,
      "loss": 3.4618,
      "step": 495
    },
    {
      "epoch": 0.78064012490242,
      "grad_norm": 5.373433106481478,
      "learning_rate": 2.776360379402445e-05,
      "loss": 3.3415,
      "step": 500
    },
    {
      "epoch": 0.7884465261514442,
      "grad_norm": 6.496439130287859,
      "learning_rate": 2.5904887464504114e-05,
      "loss": 3.2356,
      "step": 505
    },
    {
      "epoch": 0.7962529274004684,
      "grad_norm": 5.202927614655838,
      "learning_rate": 2.4101271875283817e-05,
      "loss": 3.3062,
      "step": 510
    },
    {
      "epoch": 0.8040593286494926,
      "grad_norm": 5.324495288216005,
      "learning_rate": 2.2354098282902446e-05,
      "loss": 3.2729,
      "step": 515
    },
    {
      "epoch": 0.8118657298985168,
      "grad_norm": 6.545312772932895,
      "learning_rate": 2.0664665970876496e-05,
      "loss": 3.3021,
      "step": 520
    },
    {
      "epoch": 0.819672131147541,
      "grad_norm": 7.261186320057872,
      "learning_rate": 1.903423128348959e-05,
      "loss": 3.1684,
      "step": 525
    },
    {
      "epoch": 0.8274785323965652,
      "grad_norm": 6.811704253239715,
      "learning_rate": 1.7464006691513623e-05,
      "loss": 3.2032,
      "step": 530
    },
    {
      "epoch": 0.8352849336455894,
      "grad_norm": 6.412082158580574,
      "learning_rate": 1.595515989055618e-05,
      "loss": 3.1793,
      "step": 535
    },
    {
      "epoch": 0.8430913348946136,
      "grad_norm": 6.831927670347802,
      "learning_rate": 1.4508812932705363e-05,
      "loss": 3.0849,
      "step": 540
    },
    {
      "epoch": 0.8508977361436377,
      "grad_norm": 4.372184263429538,
      "learning_rate": 1.3126041392116772e-05,
      "loss": 3.1848,
      "step": 545
    },
    {
      "epoch": 0.8587041373926619,
      "grad_norm": 7.928305921666649,
      "learning_rate": 1.1807873565164506e-05,
      "loss": 3.0978,
      "step": 550
    },
    {
      "epoch": 0.8665105386416861,
      "grad_norm": 4.701380420842563,
      "learning_rate": 1.0555289705749483e-05,
      "loss": 3.0917,
      "step": 555
    },
    {
      "epoch": 0.8743169398907104,
      "grad_norm": 8.565304872371204,
      "learning_rate": 9.369221296335006e-06,
      "loss": 3.0766,
      "step": 560
    },
    {
      "epoch": 0.8821233411397346,
      "grad_norm": 4.22210926878525,
      "learning_rate": 8.250550355250875e-06,
      "loss": 3.0209,
      "step": 565
    },
    {
      "epoch": 0.8899297423887588,
      "grad_norm": 4.982109804526221,
      "learning_rate": 7.200108780781556e-06,
      "loss": 3.1047,
      "step": 570
    },
    {
      "epoch": 0.897736143637783,
      "grad_norm": 6.891473220421351,
      "learning_rate": 6.218677732526035e-06,
      "loss": 3.0509,
      "step": 575
    },
    {
      "epoch": 0.9055425448868072,
      "grad_norm": 4.917071390959025,
      "learning_rate": 5.306987050489442e-06,
      "loss": 3.038,
      "step": 580
    },
    {
      "epoch": 0.9133489461358314,
      "grad_norm": 4.903793393578422,
      "learning_rate": 4.465714712338398e-06,
      "loss": 3.0094,
      "step": 585
    },
    {
      "epoch": 0.9211553473848556,
      "grad_norm": 5.396232120209849,
      "learning_rate": 3.6954863292237297e-06,
      "loss": 3.0498,
      "step": 590
    },
    {
      "epoch": 0.9289617486338798,
      "grad_norm": 5.802132583965925,
      "learning_rate": 2.996874680545603e-06,
      "loss": 3.057,
      "step": 595
    },
    {
      "epoch": 0.936768149882904,
      "grad_norm": 5.440739345432039,
      "learning_rate": 2.3703992880066638e-06,
      "loss": 3.0201,
      "step": 600
    },
    {
      "epoch": 0.9445745511319282,
      "grad_norm": 3.8338950124823943,
      "learning_rate": 1.8165260292704711e-06,
      "loss": 2.9121,
      "step": 605
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 5.560264910250828,
      "learning_rate": 1.3356667915121025e-06,
      "loss": 2.9788,
      "step": 610
    },
    {
      "epoch": 0.9601873536299765,
      "grad_norm": 3.797002226494683,
      "learning_rate": 9.281791651187366e-07,
      "loss": 2.9708,
      "step": 615
    },
    {
      "epoch": 0.9679937548790007,
      "grad_norm": 5.696764249716179,
      "learning_rate": 5.943661777680354e-07,
      "loss": 2.9568,
      "step": 620
    },
    {
      "epoch": 0.975800156128025,
      "grad_norm": 3.943146082989434,
      "learning_rate": 3.3447606908196817e-07,
      "loss": 2.9859,
      "step": 625
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 4.70202314087053,
      "learning_rate": 1.487021060236904e-07,
      "loss": 3.0433,
      "step": 630
    },
    {
      "epoch": 0.9914129586260734,
      "grad_norm": 4.829329261543813,
      "learning_rate": 3.7182439174832106e-08,
      "loss": 2.9716,
      "step": 635
    },
    {
      "epoch": 0.9992193598750976,
      "grad_norm": 4.305237273106749,
      "learning_rate": 0.0,
      "loss": 2.9717,
      "step": 640
    },
    {
      "epoch": 0.9992193598750976,
      "eval_loss": 5.434586524963379,
      "eval_runtime": 2.3452,
      "eval_samples_per_second": 2.132,
      "eval_steps_per_second": 0.426,
      "step": 640
    },
    {
      "epoch": 0.9992193598750976,
      "step": 640,
      "total_flos": 33474572451840.0,
      "train_loss": 5.168503437563777,
      "train_runtime": 10100.8492,
      "train_samples_per_second": 2.029,
      "train_steps_per_second": 0.063
    }
  ],
  "logging_steps": 5,
  "max_steps": 640,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 33474572451840.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}