coder_noed_10_cp3000 / trainer_state.json
{
"best_global_step": 3000,
"best_metric": 0.9244844913482666,
"best_model_checkpoint": "/workspace/runs_coder_noed_10/checkpoint-3000",
"epoch": 0.7334963325183375,
"eval_steps": 1000,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024449877750611247,
"grad_norm": 0.7603356838226318,
"learning_rate": 9.988997555012225e-06,
"loss": 2.0597,
"step": 10
},
{
"epoch": 0.004889975550122249,
"grad_norm": 0.491472065448761,
"learning_rate": 9.97677261613692e-06,
"loss": 1.8106,
"step": 20
},
{
"epoch": 0.007334963325183374,
"grad_norm": 0.421262264251709,
"learning_rate": 9.964547677261615e-06,
"loss": 1.6648,
"step": 30
},
{
"epoch": 0.009779951100244499,
"grad_norm": 0.3425203263759613,
"learning_rate": 9.95232273838631e-06,
"loss": 1.552,
"step": 40
},
{
"epoch": 0.012224938875305624,
"grad_norm": 0.3278200328350067,
"learning_rate": 9.940097799511004e-06,
"loss": 1.4701,
"step": 50
},
{
"epoch": 0.014669926650366748,
"grad_norm": 0.3467581868171692,
"learning_rate": 9.927872860635697e-06,
"loss": 1.3899,
"step": 60
},
{
"epoch": 0.017114914425427872,
"grad_norm": 0.3279109001159668,
"learning_rate": 9.915647921760392e-06,
"loss": 1.3596,
"step": 70
},
{
"epoch": 0.019559902200488997,
"grad_norm": 0.3338731825351715,
"learning_rate": 9.903422982885086e-06,
"loss": 1.335,
"step": 80
},
{
"epoch": 0.022004889975550123,
"grad_norm": 0.39886194467544556,
"learning_rate": 9.891198044009781e-06,
"loss": 1.3055,
"step": 90
},
{
"epoch": 0.02444987775061125,
"grad_norm": 0.3964369297027588,
"learning_rate": 9.878973105134476e-06,
"loss": 1.2911,
"step": 100
},
{
"epoch": 0.02689486552567237,
"grad_norm": 0.42521995306015015,
"learning_rate": 9.866748166259169e-06,
"loss": 1.2764,
"step": 110
},
{
"epoch": 0.029339853300733496,
"grad_norm": 0.40386906266212463,
"learning_rate": 9.854523227383865e-06,
"loss": 1.2573,
"step": 120
},
{
"epoch": 0.03178484107579462,
"grad_norm": 0.42052966356277466,
"learning_rate": 9.842298288508558e-06,
"loss": 1.2147,
"step": 130
},
{
"epoch": 0.034229828850855744,
"grad_norm": 0.4453684091567993,
"learning_rate": 9.830073349633253e-06,
"loss": 1.2383,
"step": 140
},
{
"epoch": 0.03667481662591687,
"grad_norm": 0.4648647606372833,
"learning_rate": 9.817848410757947e-06,
"loss": 1.2257,
"step": 150
},
{
"epoch": 0.039119804400977995,
"grad_norm": 0.4787471890449524,
"learning_rate": 9.805623471882642e-06,
"loss": 1.1803,
"step": 160
},
{
"epoch": 0.04156479217603912,
"grad_norm": 0.5450437068939209,
"learning_rate": 9.793398533007335e-06,
"loss": 1.1454,
"step": 170
},
{
"epoch": 0.044009779951100246,
"grad_norm": 0.4808744192123413,
"learning_rate": 9.78117359413203e-06,
"loss": 1.1936,
"step": 180
},
{
"epoch": 0.04645476772616137,
"grad_norm": 0.48509007692337036,
"learning_rate": 9.768948655256724e-06,
"loss": 1.1674,
"step": 190
},
{
"epoch": 0.0488997555012225,
"grad_norm": 0.5013049840927124,
"learning_rate": 9.756723716381419e-06,
"loss": 1.1691,
"step": 200
},
{
"epoch": 0.05134474327628362,
"grad_norm": 0.5472636818885803,
"learning_rate": 9.744498777506112e-06,
"loss": 1.1596,
"step": 210
},
{
"epoch": 0.05378973105134474,
"grad_norm": 0.5386976599693298,
"learning_rate": 9.732273838630808e-06,
"loss": 1.1449,
"step": 220
},
{
"epoch": 0.05623471882640587,
"grad_norm": 0.5038230419158936,
"learning_rate": 9.720048899755501e-06,
"loss": 1.1029,
"step": 230
},
{
"epoch": 0.05867970660146699,
"grad_norm": 0.5687288045883179,
"learning_rate": 9.707823960880196e-06,
"loss": 1.1463,
"step": 240
},
{
"epoch": 0.061124694376528114,
"grad_norm": 0.53361576795578,
"learning_rate": 9.69559902200489e-06,
"loss": 1.1289,
"step": 250
},
{
"epoch": 0.06356968215158924,
"grad_norm": 0.5407126545906067,
"learning_rate": 9.683374083129585e-06,
"loss": 1.0949,
"step": 260
},
{
"epoch": 0.06601466992665037,
"grad_norm": 0.6220566630363464,
"learning_rate": 9.67114914425428e-06,
"loss": 1.1131,
"step": 270
},
{
"epoch": 0.06845965770171149,
"grad_norm": 0.5715101361274719,
"learning_rate": 9.658924205378973e-06,
"loss": 1.108,
"step": 280
},
{
"epoch": 0.07090464547677261,
"grad_norm": 0.5447026491165161,
"learning_rate": 9.646699266503668e-06,
"loss": 1.0953,
"step": 290
},
{
"epoch": 0.07334963325183375,
"grad_norm": 0.607760488986969,
"learning_rate": 9.634474327628362e-06,
"loss": 1.0689,
"step": 300
},
{
"epoch": 0.07579462102689487,
"grad_norm": 0.5735874176025391,
"learning_rate": 9.622249388753057e-06,
"loss": 1.0704,
"step": 310
},
{
"epoch": 0.07823960880195599,
"grad_norm": 0.6558325886726379,
"learning_rate": 9.610024449877752e-06,
"loss": 1.0834,
"step": 320
},
{
"epoch": 0.08068459657701711,
"grad_norm": 0.644957959651947,
"learning_rate": 9.597799511002447e-06,
"loss": 1.1016,
"step": 330
},
{
"epoch": 0.08312958435207823,
"grad_norm": 0.4896561801433563,
"learning_rate": 9.58557457212714e-06,
"loss": 1.0945,
"step": 340
},
{
"epoch": 0.08557457212713937,
"grad_norm": 0.5884416699409485,
"learning_rate": 9.573349633251834e-06,
"loss": 1.1114,
"step": 350
},
{
"epoch": 0.08801955990220049,
"grad_norm": 0.6569796204566956,
"learning_rate": 9.561124694376529e-06,
"loss": 1.1083,
"step": 360
},
{
"epoch": 0.09046454767726161,
"grad_norm": 0.6272417306900024,
"learning_rate": 9.548899755501224e-06,
"loss": 1.0743,
"step": 370
},
{
"epoch": 0.09290953545232274,
"grad_norm": 0.5897823572158813,
"learning_rate": 9.536674816625916e-06,
"loss": 1.0434,
"step": 380
},
{
"epoch": 0.09535452322738386,
"grad_norm": 0.5946202278137207,
"learning_rate": 9.524449877750613e-06,
"loss": 1.0474,
"step": 390
},
{
"epoch": 0.097799511002445,
"grad_norm": 0.5851741433143616,
"learning_rate": 9.512224938875306e-06,
"loss": 1.0697,
"step": 400
},
{
"epoch": 0.10024449877750612,
"grad_norm": 0.7547958493232727,
"learning_rate": 9.5e-06,
"loss": 1.0767,
"step": 410
},
{
"epoch": 0.10268948655256724,
"grad_norm": 0.651896595954895,
"learning_rate": 9.487775061124695e-06,
"loss": 1.0853,
"step": 420
},
{
"epoch": 0.10513447432762836,
"grad_norm": 0.6152411103248596,
"learning_rate": 9.47555012224939e-06,
"loss": 1.0582,
"step": 430
},
{
"epoch": 0.10757946210268948,
"grad_norm": 0.707961916923523,
"learning_rate": 9.463325183374083e-06,
"loss": 1.0658,
"step": 440
},
{
"epoch": 0.1100244498777506,
"grad_norm": 0.6049492359161377,
"learning_rate": 9.45110024449878e-06,
"loss": 1.0687,
"step": 450
},
{
"epoch": 0.11246943765281174,
"grad_norm": 0.6512130498886108,
"learning_rate": 9.438875305623472e-06,
"loss": 1.0574,
"step": 460
},
{
"epoch": 0.11491442542787286,
"grad_norm": 0.6922876834869385,
"learning_rate": 9.426650366748167e-06,
"loss": 1.0311,
"step": 470
},
{
"epoch": 0.11735941320293398,
"grad_norm": 0.6023642420768738,
"learning_rate": 9.414425427872862e-06,
"loss": 1.0098,
"step": 480
},
{
"epoch": 0.1198044009779951,
"grad_norm": 0.6482692360877991,
"learning_rate": 9.402200488997556e-06,
"loss": 1.0537,
"step": 490
},
{
"epoch": 0.12224938875305623,
"grad_norm": 0.6799217462539673,
"learning_rate": 9.38997555012225e-06,
"loss": 1.0474,
"step": 500
},
{
"epoch": 0.12469437652811736,
"grad_norm": 0.7018452286720276,
"learning_rate": 9.377750611246944e-06,
"loss": 1.0273,
"step": 510
},
{
"epoch": 0.1271393643031785,
"grad_norm": 0.7044438123703003,
"learning_rate": 9.365525672371639e-06,
"loss": 1.0305,
"step": 520
},
{
"epoch": 0.1295843520782396,
"grad_norm": 0.7802556157112122,
"learning_rate": 9.353300733496333e-06,
"loss": 1.0758,
"step": 530
},
{
"epoch": 0.13202933985330073,
"grad_norm": 0.6845948100090027,
"learning_rate": 9.341075794621028e-06,
"loss": 1.0158,
"step": 540
},
{
"epoch": 0.13447432762836187,
"grad_norm": 0.7554173469543457,
"learning_rate": 9.328850855745723e-06,
"loss": 1.0307,
"step": 550
},
{
"epoch": 0.13691931540342298,
"grad_norm": 0.7574878334999084,
"learning_rate": 9.316625916870417e-06,
"loss": 0.9949,
"step": 560
},
{
"epoch": 0.1393643031784841,
"grad_norm": 0.6939298510551453,
"learning_rate": 9.30440097799511e-06,
"loss": 1.041,
"step": 570
},
{
"epoch": 0.14180929095354522,
"grad_norm": 0.6318020224571228,
"learning_rate": 9.292176039119805e-06,
"loss": 1.032,
"step": 580
},
{
"epoch": 0.14425427872860636,
"grad_norm": 0.7026094794273376,
"learning_rate": 9.2799511002445e-06,
"loss": 1.0642,
"step": 590
},
{
"epoch": 0.1466992665036675,
"grad_norm": 0.6520543694496155,
"learning_rate": 9.267726161369194e-06,
"loss": 1.0535,
"step": 600
},
{
"epoch": 0.1491442542787286,
"grad_norm": 0.6793085336685181,
"learning_rate": 9.255501222493887e-06,
"loss": 1.0447,
"step": 610
},
{
"epoch": 0.15158924205378974,
"grad_norm": 0.6935579776763916,
"learning_rate": 9.243276283618584e-06,
"loss": 1.0151,
"step": 620
},
{
"epoch": 0.15403422982885084,
"grad_norm": 0.767677903175354,
"learning_rate": 9.231051344743277e-06,
"loss": 0.9973,
"step": 630
},
{
"epoch": 0.15647921760391198,
"grad_norm": 0.7972632646560669,
"learning_rate": 9.218826405867971e-06,
"loss": 0.9839,
"step": 640
},
{
"epoch": 0.15892420537897312,
"grad_norm": 0.716218113899231,
"learning_rate": 9.206601466992666e-06,
"loss": 0.9674,
"step": 650
},
{
"epoch": 0.16136919315403422,
"grad_norm": 0.6637817621231079,
"learning_rate": 9.19437652811736e-06,
"loss": 1.0171,
"step": 660
},
{
"epoch": 0.16381418092909536,
"grad_norm": 0.6555060148239136,
"learning_rate": 9.182151589242054e-06,
"loss": 1.0037,
"step": 670
},
{
"epoch": 0.16625916870415647,
"grad_norm": 0.7460082769393921,
"learning_rate": 9.16992665036675e-06,
"loss": 1.0329,
"step": 680
},
{
"epoch": 0.1687041564792176,
"grad_norm": 0.6652795672416687,
"learning_rate": 9.157701711491443e-06,
"loss": 1.041,
"step": 690
},
{
"epoch": 0.17114914425427874,
"grad_norm": 0.6055278182029724,
"learning_rate": 9.145476772616138e-06,
"loss": 1.0152,
"step": 700
},
{
"epoch": 0.17359413202933985,
"grad_norm": 0.7681952118873596,
"learning_rate": 9.133251833740832e-06,
"loss": 1.0292,
"step": 710
},
{
"epoch": 0.17603911980440098,
"grad_norm": 0.8185127973556519,
"learning_rate": 9.121026894865527e-06,
"loss": 1.002,
"step": 720
},
{
"epoch": 0.1784841075794621,
"grad_norm": 0.6619365811347961,
"learning_rate": 9.10880195599022e-06,
"loss": 1.0032,
"step": 730
},
{
"epoch": 0.18092909535452323,
"grad_norm": 0.6229087710380554,
"learning_rate": 9.096577017114915e-06,
"loss": 1.0107,
"step": 740
},
{
"epoch": 0.18337408312958436,
"grad_norm": 0.8737857937812805,
"learning_rate": 9.08435207823961e-06,
"loss": 1.0276,
"step": 750
},
{
"epoch": 0.18581907090464547,
"grad_norm": 0.7398986220359802,
"learning_rate": 9.072127139364304e-06,
"loss": 1.0185,
"step": 760
},
{
"epoch": 0.1882640586797066,
"grad_norm": 0.662419855594635,
"learning_rate": 9.059902200488999e-06,
"loss": 1.0028,
"step": 770
},
{
"epoch": 0.19070904645476772,
"grad_norm": 0.7412256002426147,
"learning_rate": 9.047677261613693e-06,
"loss": 1.0102,
"step": 780
},
{
"epoch": 0.19315403422982885,
"grad_norm": 0.7194265127182007,
"learning_rate": 9.035452322738388e-06,
"loss": 1.0024,
"step": 790
},
{
"epoch": 0.19559902200489,
"grad_norm": 0.6712033152580261,
"learning_rate": 9.023227383863081e-06,
"loss": 0.981,
"step": 800
},
{
"epoch": 0.1980440097799511,
"grad_norm": 0.7297273278236389,
"learning_rate": 9.011002444987776e-06,
"loss": 1.0087,
"step": 810
},
{
"epoch": 0.20048899755501223,
"grad_norm": 0.7646290063858032,
"learning_rate": 8.99877750611247e-06,
"loss": 1.008,
"step": 820
},
{
"epoch": 0.20293398533007334,
"grad_norm": 0.6914790272712708,
"learning_rate": 8.986552567237165e-06,
"loss": 0.9967,
"step": 830
},
{
"epoch": 0.20537897310513448,
"grad_norm": 0.7169461250305176,
"learning_rate": 8.974327628361858e-06,
"loss": 0.9945,
"step": 840
},
{
"epoch": 0.2078239608801956,
"grad_norm": 0.686245322227478,
"learning_rate": 8.962102689486554e-06,
"loss": 1.003,
"step": 850
},
{
"epoch": 0.21026894865525672,
"grad_norm": 0.7075187563896179,
"learning_rate": 8.949877750611247e-06,
"loss": 0.9875,
"step": 860
},
{
"epoch": 0.21271393643031786,
"grad_norm": 0.8134737610816956,
"learning_rate": 8.937652811735942e-06,
"loss": 1.0068,
"step": 870
},
{
"epoch": 0.21515892420537897,
"grad_norm": 0.6891161799430847,
"learning_rate": 8.925427872860637e-06,
"loss": 1.0258,
"step": 880
},
{
"epoch": 0.2176039119804401,
"grad_norm": 0.8024953007698059,
"learning_rate": 8.913202933985331e-06,
"loss": 1.0199,
"step": 890
},
{
"epoch": 0.2200488997555012,
"grad_norm": 0.6608093976974487,
"learning_rate": 8.900977995110024e-06,
"loss": 1.0042,
"step": 900
},
{
"epoch": 0.22249388753056235,
"grad_norm": 0.7665097117424011,
"learning_rate": 8.888753056234719e-06,
"loss": 1.0004,
"step": 910
},
{
"epoch": 0.22493887530562348,
"grad_norm": 0.7506141662597656,
"learning_rate": 8.876528117359414e-06,
"loss": 0.9913,
"step": 920
},
{
"epoch": 0.2273838630806846,
"grad_norm": 0.8530935049057007,
"learning_rate": 8.864303178484108e-06,
"loss": 1.0061,
"step": 930
},
{
"epoch": 0.22982885085574573,
"grad_norm": 0.7526981830596924,
"learning_rate": 8.852078239608803e-06,
"loss": 1.0343,
"step": 940
},
{
"epoch": 0.23227383863080683,
"grad_norm": 0.6752293109893799,
"learning_rate": 8.839853300733498e-06,
"loss": 1.0351,
"step": 950
},
{
"epoch": 0.23471882640586797,
"grad_norm": 0.7808672189712524,
"learning_rate": 8.82762836185819e-06,
"loss": 1.0216,
"step": 960
},
{
"epoch": 0.2371638141809291,
"grad_norm": 0.7220605611801147,
"learning_rate": 8.815403422982885e-06,
"loss": 1.0066,
"step": 970
},
{
"epoch": 0.2396088019559902,
"grad_norm": 0.777722954750061,
"learning_rate": 8.80317848410758e-06,
"loss": 0.9936,
"step": 980
},
{
"epoch": 0.24205378973105135,
"grad_norm": 0.7381393313407898,
"learning_rate": 8.790953545232275e-06,
"loss": 0.9982,
"step": 990
},
{
"epoch": 0.24449877750611246,
"grad_norm": 0.6590691804885864,
"learning_rate": 8.77872860635697e-06,
"loss": 0.9788,
"step": 1000
},
{
"epoch": 0.24449877750611246,
"eval_loss": 0.9873583912849426,
"eval_runtime": 795.9609,
"eval_samples_per_second": 18.27,
"eval_steps_per_second": 0.572,
"step": 1000
},
{
"epoch": 0.2469437652811736,
"grad_norm": 0.851658284664154,
"learning_rate": 8.766503667481662e-06,
"loss": 0.9715,
"step": 1010
},
{
"epoch": 0.24938875305623473,
"grad_norm": 0.6827694177627563,
"learning_rate": 8.754278728606359e-06,
"loss": 0.9521,
"step": 1020
},
{
"epoch": 0.25183374083129584,
"grad_norm": 0.7433952689170837,
"learning_rate": 8.742053789731052e-06,
"loss": 1.0064,
"step": 1030
},
{
"epoch": 0.254278728606357,
"grad_norm": 0.7873063087463379,
"learning_rate": 8.729828850855746e-06,
"loss": 0.997,
"step": 1040
},
{
"epoch": 0.2567237163814181,
"grad_norm": 0.7808490991592407,
"learning_rate": 8.717603911980441e-06,
"loss": 1.021,
"step": 1050
},
{
"epoch": 0.2591687041564792,
"grad_norm": 0.7002695798873901,
"learning_rate": 8.705378973105136e-06,
"loss": 0.9787,
"step": 1060
},
{
"epoch": 0.2616136919315403,
"grad_norm": 0.8023979663848877,
"learning_rate": 8.693154034229829e-06,
"loss": 0.9913,
"step": 1070
},
{
"epoch": 0.26405867970660146,
"grad_norm": 0.6559330224990845,
"learning_rate": 8.680929095354525e-06,
"loss": 0.9755,
"step": 1080
},
{
"epoch": 0.2665036674816626,
"grad_norm": 0.703667402267456,
"learning_rate": 8.668704156479218e-06,
"loss": 1.0047,
"step": 1090
},
{
"epoch": 0.26894865525672373,
"grad_norm": 0.7463963627815247,
"learning_rate": 8.656479217603913e-06,
"loss": 0.9607,
"step": 1100
},
{
"epoch": 0.2713936430317848,
"grad_norm": 0.8031372427940369,
"learning_rate": 8.644254278728606e-06,
"loss": 0.9835,
"step": 1110
},
{
"epoch": 0.27383863080684595,
"grad_norm": 0.7800993919372559,
"learning_rate": 8.632029339853302e-06,
"loss": 0.9651,
"step": 1120
},
{
"epoch": 0.2762836185819071,
"grad_norm": 0.8147879242897034,
"learning_rate": 8.619804400977995e-06,
"loss": 1.029,
"step": 1130
},
{
"epoch": 0.2787286063569682,
"grad_norm": 0.6160231828689575,
"learning_rate": 8.60757946210269e-06,
"loss": 0.9771,
"step": 1140
},
{
"epoch": 0.28117359413202936,
"grad_norm": 0.8735865354537964,
"learning_rate": 8.595354523227385e-06,
"loss": 0.9719,
"step": 1150
},
{
"epoch": 0.28361858190709044,
"grad_norm": 0.6970623135566711,
"learning_rate": 8.58312958435208e-06,
"loss": 0.9948,
"step": 1160
},
{
"epoch": 0.2860635696821516,
"grad_norm": 0.7240091562271118,
"learning_rate": 8.570904645476774e-06,
"loss": 0.9798,
"step": 1170
},
{
"epoch": 0.2885085574572127,
"grad_norm": 0.8112177848815918,
"learning_rate": 8.558679706601469e-06,
"loss": 0.9857,
"step": 1180
},
{
"epoch": 0.29095354523227385,
"grad_norm": 0.7607941031455994,
"learning_rate": 8.546454767726162e-06,
"loss": 0.9467,
"step": 1190
},
{
"epoch": 0.293398533007335,
"grad_norm": 0.7950695753097534,
"learning_rate": 8.534229828850856e-06,
"loss": 0.9691,
"step": 1200
},
{
"epoch": 0.29584352078239606,
"grad_norm": 0.7780068516731262,
"learning_rate": 8.522004889975551e-06,
"loss": 0.9658,
"step": 1210
},
{
"epoch": 0.2982885085574572,
"grad_norm": 0.7490630149841309,
"learning_rate": 8.509779951100246e-06,
"loss": 0.9528,
"step": 1220
},
{
"epoch": 0.30073349633251834,
"grad_norm": 0.7754825353622437,
"learning_rate": 8.49755501222494e-06,
"loss": 0.995,
"step": 1230
},
{
"epoch": 0.30317848410757947,
"grad_norm": 0.7797338366508484,
"learning_rate": 8.485330073349633e-06,
"loss": 0.9976,
"step": 1240
},
{
"epoch": 0.3056234718826406,
"grad_norm": 0.7192637920379639,
"learning_rate": 8.473105134474328e-06,
"loss": 0.9764,
"step": 1250
},
{
"epoch": 0.3080684596577017,
"grad_norm": 0.7009165287017822,
"learning_rate": 8.460880195599023e-06,
"loss": 0.9794,
"step": 1260
},
{
"epoch": 0.3105134474327628,
"grad_norm": 0.7941039204597473,
"learning_rate": 8.448655256723717e-06,
"loss": 0.9701,
"step": 1270
},
{
"epoch": 0.31295843520782396,
"grad_norm": 0.7658547163009644,
"learning_rate": 8.436430317848412e-06,
"loss": 0.9658,
"step": 1280
},
{
"epoch": 0.3154034229828851,
"grad_norm": 0.8190957903862,
"learning_rate": 8.424205378973107e-06,
"loss": 0.9566,
"step": 1290
},
{
"epoch": 0.31784841075794623,
"grad_norm": 0.7688964605331421,
"learning_rate": 8.4119804400978e-06,
"loss": 0.9743,
"step": 1300
},
{
"epoch": 0.3202933985330073,
"grad_norm": 0.7160921096801758,
"learning_rate": 8.399755501222494e-06,
"loss": 0.9363,
"step": 1310
},
{
"epoch": 0.32273838630806845,
"grad_norm": 0.6877838373184204,
"learning_rate": 8.387530562347189e-06,
"loss": 0.9704,
"step": 1320
},
{
"epoch": 0.3251833740831296,
"grad_norm": 0.7970196008682251,
"learning_rate": 8.375305623471884e-06,
"loss": 0.9824,
"step": 1330
},
{
"epoch": 0.3276283618581907,
"grad_norm": 0.7897329926490784,
"learning_rate": 8.363080684596577e-06,
"loss": 0.971,
"step": 1340
},
{
"epoch": 0.33007334963325186,
"grad_norm": 0.7263765931129456,
"learning_rate": 8.350855745721273e-06,
"loss": 0.9754,
"step": 1350
},
{
"epoch": 0.33251833740831294,
"grad_norm": 0.6933903098106384,
"learning_rate": 8.338630806845966e-06,
"loss": 0.9451,
"step": 1360
},
{
"epoch": 0.33496332518337407,
"grad_norm": 0.6975806951522827,
"learning_rate": 8.32640586797066e-06,
"loss": 0.9538,
"step": 1370
},
{
"epoch": 0.3374083129584352,
"grad_norm": 0.8183455467224121,
"learning_rate": 8.314180929095355e-06,
"loss": 0.955,
"step": 1380
},
{
"epoch": 0.33985330073349634,
"grad_norm": 0.8085072636604309,
"learning_rate": 8.30195599022005e-06,
"loss": 0.9337,
"step": 1390
},
{
"epoch": 0.3422982885085575,
"grad_norm": 0.7899576425552368,
"learning_rate": 8.289731051344743e-06,
"loss": 0.984,
"step": 1400
},
{
"epoch": 0.34474327628361856,
"grad_norm": 0.8049628734588623,
"learning_rate": 8.277506112469438e-06,
"loss": 0.967,
"step": 1410
},
{
"epoch": 0.3471882640586797,
"grad_norm": 0.7915717959403992,
"learning_rate": 8.265281173594132e-06,
"loss": 0.9379,
"step": 1420
},
{
"epoch": 0.34963325183374083,
"grad_norm": 0.7397669553756714,
"learning_rate": 8.253056234718827e-06,
"loss": 0.9771,
"step": 1430
},
{
"epoch": 0.35207823960880197,
"grad_norm": 0.7894385457038879,
"learning_rate": 8.240831295843522e-06,
"loss": 0.9647,
"step": 1440
},
{
"epoch": 0.3545232273838631,
"grad_norm": 0.7507162094116211,
"learning_rate": 8.228606356968216e-06,
"loss": 0.9783,
"step": 1450
},
{
"epoch": 0.3569682151589242,
"grad_norm": 0.8646982908248901,
"learning_rate": 8.216381418092911e-06,
"loss": 0.9817,
"step": 1460
},
{
"epoch": 0.3594132029339853,
"grad_norm": 0.7320569753646851,
"learning_rate": 8.204156479217604e-06,
"loss": 0.9502,
"step": 1470
},
{
"epoch": 0.36185819070904646,
"grad_norm": 0.7949222922325134,
"learning_rate": 8.191931540342299e-06,
"loss": 0.9694,
"step": 1480
},
{
"epoch": 0.3643031784841076,
"grad_norm": 0.8175340294837952,
"learning_rate": 8.179706601466993e-06,
"loss": 0.972,
"step": 1490
},
{
"epoch": 0.36674816625916873,
"grad_norm": 0.8479053974151611,
"learning_rate": 8.167481662591688e-06,
"loss": 1.0059,
"step": 1500
},
{
"epoch": 0.3691931540342298,
"grad_norm": 0.7485557794570923,
"learning_rate": 8.155256723716381e-06,
"loss": 0.9303,
"step": 1510
},
{
"epoch": 0.37163814180929094,
"grad_norm": 0.7518207430839539,
"learning_rate": 8.143031784841077e-06,
"loss": 0.9572,
"step": 1520
},
{
"epoch": 0.3740831295843521,
"grad_norm": 0.8368051052093506,
"learning_rate": 8.13080684596577e-06,
"loss": 0.9865,
"step": 1530
},
{
"epoch": 0.3765281173594132,
"grad_norm": 0.8675082325935364,
"learning_rate": 8.118581907090465e-06,
"loss": 0.9355,
"step": 1540
},
{
"epoch": 0.37897310513447435,
"grad_norm": 0.8239061832427979,
"learning_rate": 8.10635696821516e-06,
"loss": 0.9475,
"step": 1550
},
{
"epoch": 0.38141809290953543,
"grad_norm": 0.700191080570221,
"learning_rate": 8.094132029339854e-06,
"loss": 0.9343,
"step": 1560
},
{
"epoch": 0.38386308068459657,
"grad_norm": 0.789686381816864,
"learning_rate": 8.081907090464547e-06,
"loss": 0.9633,
"step": 1570
},
{
"epoch": 0.3863080684596577,
"grad_norm": 0.8411287665367126,
"learning_rate": 8.069682151589244e-06,
"loss": 0.991,
"step": 1580
},
{
"epoch": 0.38875305623471884,
"grad_norm": 0.7479956150054932,
"learning_rate": 8.057457212713937e-06,
"loss": 0.9371,
"step": 1590
},
{
"epoch": 0.39119804400978,
"grad_norm": 0.726841151714325,
"learning_rate": 8.045232273838631e-06,
"loss": 0.9595,
"step": 1600
},
{
"epoch": 0.39364303178484106,
"grad_norm": 0.7860400676727295,
"learning_rate": 8.033007334963326e-06,
"loss": 0.9673,
"step": 1610
},
{
"epoch": 0.3960880195599022,
"grad_norm": 0.8316232562065125,
"learning_rate": 8.02078239608802e-06,
"loss": 0.9592,
"step": 1620
},
{
"epoch": 0.39853300733496333,
"grad_norm": 0.743289053440094,
"learning_rate": 8.008557457212714e-06,
"loss": 0.9424,
"step": 1630
},
{
"epoch": 0.40097799511002447,
"grad_norm": 0.7066758275032043,
"learning_rate": 7.996332518337408e-06,
"loss": 0.9515,
"step": 1640
},
{
"epoch": 0.4034229828850856,
"grad_norm": 0.8527940511703491,
"learning_rate": 7.984107579462103e-06,
"loss": 0.9579,
"step": 1650
},
{
"epoch": 0.4058679706601467,
"grad_norm": 0.747097909450531,
"learning_rate": 7.971882640586798e-06,
"loss": 0.96,
"step": 1660
},
{
"epoch": 0.4083129584352078,
"grad_norm": 0.7440111637115479,
"learning_rate": 7.959657701711492e-06,
"loss": 0.9428,
"step": 1670
},
{
"epoch": 0.41075794621026895,
"grad_norm": 0.7893000245094299,
"learning_rate": 7.947432762836187e-06,
"loss": 0.9775,
"step": 1680
},
{
"epoch": 0.4132029339853301,
"grad_norm": 0.7849116325378418,
"learning_rate": 7.935207823960882e-06,
"loss": 0.9636,
"step": 1690
},
{
"epoch": 0.4156479217603912,
"grad_norm": 0.8839460611343384,
"learning_rate": 7.922982885085575e-06,
"loss": 0.9427,
"step": 1700
},
{
"epoch": 0.4180929095354523,
"grad_norm": 0.7875691056251526,
"learning_rate": 7.91075794621027e-06,
"loss": 0.9188,
"step": 1710
},
{
"epoch": 0.42053789731051344,
"grad_norm": 0.7899657487869263,
"learning_rate": 7.898533007334964e-06,
"loss": 0.9337,
"step": 1720
},
{
"epoch": 0.4229828850855746,
"grad_norm": 0.8064518570899963,
"learning_rate": 7.886308068459659e-06,
"loss": 0.9473,
"step": 1730
},
{
"epoch": 0.4254278728606357,
"grad_norm": 0.9735845327377319,
"learning_rate": 7.874083129584352e-06,
"loss": 0.943,
"step": 1740
},
{
"epoch": 0.4278728606356968,
"grad_norm": 0.8656879663467407,
"learning_rate": 7.861858190709048e-06,
"loss": 0.9276,
"step": 1750
},
{
"epoch": 0.43031784841075793,
"grad_norm": 0.7446141242980957,
"learning_rate": 7.849633251833741e-06,
"loss": 0.9609,
"step": 1760
},
{
"epoch": 0.43276283618581907,
"grad_norm": 0.7805430889129639,
"learning_rate": 7.837408312958436e-06,
"loss": 0.9437,
"step": 1770
},
{
"epoch": 0.4352078239608802,
"grad_norm": 0.8328519463539124,
"learning_rate": 7.82518337408313e-06,
"loss": 0.9921,
"step": 1780
},
{
"epoch": 0.43765281173594134,
"grad_norm": 0.7910134196281433,
"learning_rate": 7.812958435207825e-06,
"loss": 0.9392,
"step": 1790
},
{
"epoch": 0.4400977995110024,
"grad_norm": 0.8082761168479919,
"learning_rate": 7.800733496332518e-06,
"loss": 0.9655,
"step": 1800
},
{
"epoch": 0.44254278728606355,
"grad_norm": 0.7104289531707764,
"learning_rate": 7.788508557457214e-06,
"loss": 0.9604,
"step": 1810
},
{
"epoch": 0.4449877750611247,
"grad_norm": 0.7942298054695129,
"learning_rate": 7.776283618581907e-06,
"loss": 0.9839,
"step": 1820
},
{
"epoch": 0.4474327628361858,
"grad_norm": 0.7665939927101135,
"learning_rate": 7.764058679706602e-06,
"loss": 0.9907,
"step": 1830
},
{
"epoch": 0.44987775061124696,
"grad_norm": 0.8066325187683105,
"learning_rate": 7.751833740831297e-06,
"loss": 0.9503,
"step": 1840
},
{
"epoch": 0.45232273838630804,
"grad_norm": 0.7494056224822998,
"learning_rate": 7.739608801955991e-06,
"loss": 0.9594,
"step": 1850
},
{
"epoch": 0.4547677261613692,
"grad_norm": 0.7743037939071655,
"learning_rate": 7.727383863080684e-06,
"loss": 0.9708,
"step": 1860
},
{
"epoch": 0.4572127139364303,
"grad_norm": 0.6371968388557434,
"learning_rate": 7.715158924205379e-06,
"loss": 0.9247,
"step": 1870
},
{
"epoch": 0.45965770171149145,
"grad_norm": 0.8373169898986816,
"learning_rate": 7.702933985330074e-06,
"loss": 0.9631,
"step": 1880
},
{
"epoch": 0.4621026894865526,
"grad_norm": 0.898855984210968,
"learning_rate": 7.690709046454768e-06,
"loss": 0.9598,
"step": 1890
},
{
"epoch": 0.46454767726161367,
"grad_norm": 0.7827709317207336,
"learning_rate": 7.678484107579463e-06,
"loss": 0.9407,
"step": 1900
},
{
"epoch": 0.4669926650366748,
"grad_norm": 0.8428008556365967,
"learning_rate": 7.666259168704158e-06,
"loss": 0.9753,
"step": 1910
},
{
"epoch": 0.46943765281173594,
"grad_norm": 0.7905760407447815,
"learning_rate": 7.654034229828853e-06,
"loss": 0.9487,
"step": 1920
},
{
"epoch": 0.4718826405867971,
"grad_norm": 0.7715455293655396,
"learning_rate": 7.641809290953546e-06,
"loss": 0.9304,
"step": 1930
},
{
"epoch": 0.4743276283618582,
"grad_norm": 0.7903933525085449,
"learning_rate": 7.62958435207824e-06,
"loss": 0.9277,
"step": 1940
},
{
"epoch": 0.4767726161369193,
"grad_norm": 0.7191179990768433,
"learning_rate": 7.617359413202935e-06,
"loss": 0.915,
"step": 1950
},
{
"epoch": 0.4792176039119804,
"grad_norm": 0.8574967980384827,
"learning_rate": 7.605134474327629e-06,
"loss": 0.918,
"step": 1960
},
{
"epoch": 0.48166259168704156,
"grad_norm": 0.8661892414093018,
"learning_rate": 7.592909535452323e-06,
"loss": 0.9658,
"step": 1970
},
{
"epoch": 0.4841075794621027,
"grad_norm": 0.8399495482444763,
"learning_rate": 7.580684596577018e-06,
"loss": 0.9611,
"step": 1980
},
{
"epoch": 0.48655256723716384,
"grad_norm": 0.8675727844238281,
"learning_rate": 7.568459657701712e-06,
"loss": 0.9367,
"step": 1990
},
{
"epoch": 0.4889975550122249,
"grad_norm": 0.8307384252548218,
"learning_rate": 7.5562347188264065e-06,
"loss": 0.9628,
"step": 2000
},
{
"epoch": 0.4889975550122249,
"eval_loss": 0.9446731805801392,
"eval_runtime": 795.9747,
"eval_samples_per_second": 18.269,
"eval_steps_per_second": 0.572,
"step": 2000
},
{
"epoch": 0.49144254278728605,
"grad_norm": 0.7585815191268921,
"learning_rate": 7.544009779951101e-06,
"loss": 0.9499,
"step": 2010
},
{
"epoch": 0.4938875305623472,
"grad_norm": 0.870883584022522,
"learning_rate": 7.531784841075795e-06,
"loss": 0.9758,
"step": 2020
},
{
"epoch": 0.4963325183374083,
"grad_norm": 0.7365143895149231,
"learning_rate": 7.51955990220049e-06,
"loss": 0.9429,
"step": 2030
},
{
"epoch": 0.49877750611246946,
"grad_norm": 0.9007924199104309,
"learning_rate": 7.5073349633251836e-06,
"loss": 0.9208,
"step": 2040
},
{
"epoch": 0.5012224938875306,
"grad_norm": 0.8040947914123535,
"learning_rate": 7.495110024449879e-06,
"loss": 0.931,
"step": 2050
},
{
"epoch": 0.5036674816625917,
"grad_norm": 0.7994057536125183,
"learning_rate": 7.482885085574573e-06,
"loss": 0.9406,
"step": 2060
},
{
"epoch": 0.5061124694376528,
"grad_norm": 0.7404657006263733,
"learning_rate": 7.470660146699267e-06,
"loss": 0.91,
"step": 2070
},
{
"epoch": 0.508557457212714,
"grad_norm": 0.8444405794143677,
"learning_rate": 7.458435207823962e-06,
"loss": 0.9748,
"step": 2080
},
{
"epoch": 0.511002444987775,
"grad_norm": 0.830786406993866,
"learning_rate": 7.446210268948656e-06,
"loss": 0.9546,
"step": 2090
},
{
"epoch": 0.5134474327628362,
"grad_norm": 0.8706929683685303,
"learning_rate": 7.43398533007335e-06,
"loss": 0.9359,
"step": 2100
},
{
"epoch": 0.5158924205378973,
"grad_norm": 0.937125563621521,
"learning_rate": 7.4217603911980454e-06,
"loss": 0.9367,
"step": 2110
},
{
"epoch": 0.5183374083129584,
"grad_norm": 0.871612548828125,
"learning_rate": 7.409535452322739e-06,
"loss": 0.9363,
"step": 2120
},
{
"epoch": 0.5207823960880196,
"grad_norm": 0.7397099733352661,
"learning_rate": 7.397310513447433e-06,
"loss": 0.9312,
"step": 2130
},
{
"epoch": 0.5232273838630807,
"grad_norm": 0.9171463847160339,
"learning_rate": 7.385085574572127e-06,
"loss": 0.9657,
"step": 2140
},
{
"epoch": 0.5256723716381418,
"grad_norm": 0.7274787425994873,
"learning_rate": 7.3728606356968224e-06,
"loss": 0.9359,
"step": 2150
},
{
"epoch": 0.5281173594132029,
"grad_norm": 0.8870149850845337,
"learning_rate": 7.360635696821516e-06,
"loss": 0.9469,
"step": 2160
},
{
"epoch": 0.530562347188264,
"grad_norm": 0.770986020565033,
"learning_rate": 7.34841075794621e-06,
"loss": 0.9529,
"step": 2170
},
{
"epoch": 0.5330073349633252,
"grad_norm": 0.7791648507118225,
"learning_rate": 7.336185819070906e-06,
"loss": 0.9509,
"step": 2180
},
{
"epoch": 0.5354523227383863,
"grad_norm": 0.8232088088989258,
"learning_rate": 7.3239608801955995e-06,
"loss": 0.9639,
"step": 2190
},
{
"epoch": 0.5378973105134475,
"grad_norm": 0.8500565886497498,
"learning_rate": 7.311735941320294e-06,
"loss": 0.97,
"step": 2200
},
{
"epoch": 0.5403422982885085,
"grad_norm": 0.8228991627693176,
"learning_rate": 7.299511002444989e-06,
"loss": 0.9514,
"step": 2210
},
{
"epoch": 0.5427872860635696,
"grad_norm": 0.7978509068489075,
"learning_rate": 7.287286063569683e-06,
"loss": 0.9599,
"step": 2220
},
{
"epoch": 0.5452322738386308,
"grad_norm": 0.7359360456466675,
"learning_rate": 7.275061124694377e-06,
"loss": 0.9087,
"step": 2230
},
{
"epoch": 0.5476772616136919,
"grad_norm": 0.8995153903961182,
"learning_rate": 7.262836185819071e-06,
"loss": 0.9082,
"step": 2240
},
{
"epoch": 0.5501222493887531,
"grad_norm": 0.7778546214103699,
"learning_rate": 7.250611246943766e-06,
"loss": 0.9599,
"step": 2250
},
{
"epoch": 0.5525672371638142,
"grad_norm": 0.7799263596534729,
"learning_rate": 7.2383863080684605e-06,
"loss": 0.895,
"step": 2260
},
{
"epoch": 0.5550122249388753,
"grad_norm": 0.831047534942627,
"learning_rate": 7.226161369193154e-06,
"loss": 0.9426,
"step": 2270
},
{
"epoch": 0.5574572127139364,
"grad_norm": 0.863277792930603,
"learning_rate": 7.213936430317849e-06,
"loss": 0.947,
"step": 2280
},
{
"epoch": 0.5599022004889975,
"grad_norm": 0.7998844981193542,
"learning_rate": 7.201711491442544e-06,
"loss": 0.9366,
"step": 2290
},
{
"epoch": 0.5623471882640587,
"grad_norm": 0.818229079246521,
"learning_rate": 7.1894865525672375e-06,
"loss": 0.9524,
"step": 2300
},
{
"epoch": 0.5647921760391198,
"grad_norm": 0.8541170954704285,
"learning_rate": 7.177261613691933e-06,
"loss": 0.9297,
"step": 2310
},
{
"epoch": 0.5672371638141809,
"grad_norm": 0.7943381071090698,
"learning_rate": 7.165036674816627e-06,
"loss": 0.9186,
"step": 2320
},
{
"epoch": 0.5696821515892421,
"grad_norm": 0.886789858341217,
"learning_rate": 7.152811735941321e-06,
"loss": 0.9551,
"step": 2330
},
{
"epoch": 0.5721271393643031,
"grad_norm": 0.7614038586616516,
"learning_rate": 7.1405867970660145e-06,
"loss": 0.9427,
"step": 2340
},
{
"epoch": 0.5745721271393643,
"grad_norm": 0.8962105512619019,
"learning_rate": 7.12836185819071e-06,
"loss": 0.9191,
"step": 2350
},
{
"epoch": 0.5770171149144254,
"grad_norm": 0.9211586713790894,
"learning_rate": 7.116136919315404e-06,
"loss": 0.933,
"step": 2360
},
{
"epoch": 0.5794621026894865,
"grad_norm": 0.9243327975273132,
"learning_rate": 7.103911980440098e-06,
"loss": 0.9152,
"step": 2370
},
{
"epoch": 0.5819070904645477,
"grad_norm": 0.851266086101532,
"learning_rate": 7.091687041564793e-06,
"loss": 0.9697,
"step": 2380
},
{
"epoch": 0.5843520782396088,
"grad_norm": 0.8055517673492432,
"learning_rate": 7.079462102689487e-06,
"loss": 0.921,
"step": 2390
},
{
"epoch": 0.58679706601467,
"grad_norm": 0.9354420900344849,
"learning_rate": 7.067237163814181e-06,
"loss": 0.9368,
"step": 2400
},
{
"epoch": 0.589242053789731,
"grad_norm": 0.8339366912841797,
"learning_rate": 7.055012224938876e-06,
"loss": 0.9387,
"step": 2410
},
{
"epoch": 0.5916870415647921,
"grad_norm": 0.8128229975700378,
"learning_rate": 7.04278728606357e-06,
"loss": 0.9462,
"step": 2420
},
{
"epoch": 0.5941320293398533,
"grad_norm": 0.7740962505340576,
"learning_rate": 7.030562347188264e-06,
"loss": 0.931,
"step": 2430
},
{
"epoch": 0.5965770171149144,
"grad_norm": 0.8874317407608032,
"learning_rate": 7.018337408312959e-06,
"loss": 0.9809,
"step": 2440
},
{
"epoch": 0.5990220048899756,
"grad_norm": 0.8482634425163269,
"learning_rate": 7.006112469437653e-06,
"loss": 0.9104,
"step": 2450
},
{
"epoch": 0.6014669926650367,
"grad_norm": 0.7957248687744141,
"learning_rate": 6.993887530562348e-06,
"loss": 0.9195,
"step": 2460
},
{
"epoch": 0.6039119804400978,
"grad_norm": 0.8182454109191895,
"learning_rate": 6.981662591687042e-06,
"loss": 0.9565,
"step": 2470
},
{
"epoch": 0.6063569682151589,
"grad_norm": 0.7790459990501404,
"learning_rate": 6.969437652811737e-06,
"loss": 0.9114,
"step": 2480
},
{
"epoch": 0.60880195599022,
"grad_norm": 0.7870607376098633,
"learning_rate": 6.957212713936431e-06,
"loss": 0.9372,
"step": 2490
},
{
"epoch": 0.6112469437652812,
"grad_norm": 0.8847014307975769,
"learning_rate": 6.944987775061125e-06,
"loss": 0.9344,
"step": 2500
},
{
"epoch": 0.6136919315403423,
"grad_norm": 0.8556163907051086,
"learning_rate": 6.93276283618582e-06,
"loss": 0.915,
"step": 2510
},
{
"epoch": 0.6161369193154034,
"grad_norm": 0.846760630607605,
"learning_rate": 6.9205378973105144e-06,
"loss": 0.9301,
"step": 2520
},
{
"epoch": 0.6185819070904646,
"grad_norm": 0.7853952646255493,
"learning_rate": 6.908312958435208e-06,
"loss": 0.9395,
"step": 2530
},
{
"epoch": 0.6210268948655256,
"grad_norm": 0.8032101392745972,
"learning_rate": 6.896088019559902e-06,
"loss": 0.9561,
"step": 2540
},
{
"epoch": 0.6234718826405868,
"grad_norm": 0.7820572853088379,
"learning_rate": 6.883863080684598e-06,
"loss": 0.9167,
"step": 2550
},
{
"epoch": 0.6259168704156479,
"grad_norm": 0.8646982908248901,
"learning_rate": 6.8716381418092915e-06,
"loss": 0.9297,
"step": 2560
},
{
"epoch": 0.628361858190709,
"grad_norm": 0.8482604026794434,
"learning_rate": 6.859413202933985e-06,
"loss": 0.9254,
"step": 2570
},
{
"epoch": 0.6308068459657702,
"grad_norm": 0.775800883769989,
"learning_rate": 6.847188264058681e-06,
"loss": 0.9346,
"step": 2580
},
{
"epoch": 0.6332518337408313,
"grad_norm": 0.7460722327232361,
"learning_rate": 6.834963325183375e-06,
"loss": 0.9545,
"step": 2590
},
{
"epoch": 0.6356968215158925,
"grad_norm": 0.8128139972686768,
"learning_rate": 6.8227383863080685e-06,
"loss": 0.9262,
"step": 2600
},
{
"epoch": 0.6381418092909535,
"grad_norm": 0.8368150591850281,
"learning_rate": 6.810513447432764e-06,
"loss": 0.9123,
"step": 2610
},
{
"epoch": 0.6405867970660146,
"grad_norm": 0.7825431227684021,
"learning_rate": 6.798288508557458e-06,
"loss": 0.9336,
"step": 2620
},
{
"epoch": 0.6430317848410758,
"grad_norm": 0.7432078123092651,
"learning_rate": 6.786063569682152e-06,
"loss": 0.9396,
"step": 2630
},
{
"epoch": 0.6454767726161369,
"grad_norm": 0.6994110345840454,
"learning_rate": 6.773838630806846e-06,
"loss": 0.9527,
"step": 2640
},
{
"epoch": 0.6479217603911981,
"grad_norm": 0.8581116795539856,
"learning_rate": 6.761613691931541e-06,
"loss": 0.8979,
"step": 2650
},
{
"epoch": 0.6503667481662592,
"grad_norm": 0.7824479937553406,
"learning_rate": 6.749388753056235e-06,
"loss": 0.93,
"step": 2660
},
{
"epoch": 0.6528117359413202,
"grad_norm": 0.7407302260398865,
"learning_rate": 6.7371638141809295e-06,
"loss": 0.9175,
"step": 2670
},
{
"epoch": 0.6552567237163814,
"grad_norm": 0.8635402917861938,
"learning_rate": 6.724938875305624e-06,
"loss": 0.951,
"step": 2680
},
{
"epoch": 0.6577017114914425,
"grad_norm": 0.7739470601081848,
"learning_rate": 6.712713936430319e-06,
"loss": 0.9199,
"step": 2690
},
{
"epoch": 0.6601466992665037,
"grad_norm": 0.879205048084259,
"learning_rate": 6.700488997555013e-06,
"loss": 0.9296,
"step": 2700
},
{
"epoch": 0.6625916870415648,
"grad_norm": 0.7694469094276428,
"learning_rate": 6.688264058679707e-06,
"loss": 0.9312,
"step": 2710
},
{
"epoch": 0.6650366748166259,
"grad_norm": 0.8447741270065308,
"learning_rate": 6.676039119804402e-06,
"loss": 0.9486,
"step": 2720
},
{
"epoch": 0.6674816625916871,
"grad_norm": 0.7987990379333496,
"learning_rate": 6.663814180929096e-06,
"loss": 0.9554,
"step": 2730
},
{
"epoch": 0.6699266503667481,
"grad_norm": 0.8286950588226318,
"learning_rate": 6.65158924205379e-06,
"loss": 0.9264,
"step": 2740
},
{
"epoch": 0.6723716381418093,
"grad_norm": 0.8135730028152466,
"learning_rate": 6.639364303178485e-06,
"loss": 0.9183,
"step": 2750
},
{
"epoch": 0.6748166259168704,
"grad_norm": 0.9580305218696594,
"learning_rate": 6.627139364303179e-06,
"loss": 0.9055,
"step": 2760
},
{
"epoch": 0.6772616136919315,
"grad_norm": 0.7580097317695618,
"learning_rate": 6.614914425427873e-06,
"loss": 0.923,
"step": 2770
},
{
"epoch": 0.6797066014669927,
"grad_norm": 0.7250223755836487,
"learning_rate": 6.602689486552568e-06,
"loss": 0.9472,
"step": 2780
},
{
"epoch": 0.6821515892420538,
"grad_norm": 0.720930278301239,
"learning_rate": 6.590464547677262e-06,
"loss": 0.9155,
"step": 2790
},
{
"epoch": 0.684596577017115,
"grad_norm": 0.7639275789260864,
"learning_rate": 6.578239608801956e-06,
"loss": 0.917,
"step": 2800
},
{
"epoch": 0.687041564792176,
"grad_norm": 0.8890448808670044,
"learning_rate": 6.5660146699266516e-06,
"loss": 0.933,
"step": 2810
},
{
"epoch": 0.6894865525672371,
"grad_norm": 0.816336452960968,
"learning_rate": 6.553789731051345e-06,
"loss": 0.9606,
"step": 2820
},
{
"epoch": 0.6919315403422983,
"grad_norm": 0.8258111476898193,
"learning_rate": 6.541564792176039e-06,
"loss": 0.9331,
"step": 2830
},
{
"epoch": 0.6943765281173594,
"grad_norm": 0.8009893894195557,
"learning_rate": 6.529339853300734e-06,
"loss": 0.918,
"step": 2840
},
{
"epoch": 0.6968215158924206,
"grad_norm": 0.861232578754425,
"learning_rate": 6.517114914425429e-06,
"loss": 0.9157,
"step": 2850
},
{
"epoch": 0.6992665036674817,
"grad_norm": 0.8324180841445923,
"learning_rate": 6.504889975550122e-06,
"loss": 0.9322,
"step": 2860
},
{
"epoch": 0.7017114914425427,
"grad_norm": 0.8205760717391968,
"learning_rate": 6.492665036674817e-06,
"loss": 0.9401,
"step": 2870
},
{
"epoch": 0.7041564792176039,
"grad_norm": 0.8924916386604309,
"learning_rate": 6.480440097799512e-06,
"loss": 0.9165,
"step": 2880
},
{
"epoch": 0.706601466992665,
"grad_norm": 0.7587029337882996,
"learning_rate": 6.468215158924206e-06,
"loss": 0.9127,
"step": 2890
},
{
"epoch": 0.7090464547677262,
"grad_norm": 0.7448475360870361,
"learning_rate": 6.4559902200489e-06,
"loss": 0.9366,
"step": 2900
},
{
"epoch": 0.7114914425427873,
"grad_norm": 0.8168658018112183,
"learning_rate": 6.443765281173595e-06,
"loss": 0.9377,
"step": 2910
},
{
"epoch": 0.7139364303178484,
"grad_norm": 0.7668200731277466,
"learning_rate": 6.431540342298289e-06,
"loss": 0.9342,
"step": 2920
},
{
"epoch": 0.7163814180929096,
"grad_norm": 0.8051294088363647,
"learning_rate": 6.4193154034229834e-06,
"loss": 0.9236,
"step": 2930
},
{
"epoch": 0.7188264058679706,
"grad_norm": 0.7371141910552979,
"learning_rate": 6.407090464547677e-06,
"loss": 0.9123,
"step": 2940
},
{
"epoch": 0.7212713936430318,
"grad_norm": 0.7507117390632629,
"learning_rate": 6.394865525672373e-06,
"loss": 0.8978,
"step": 2950
},
{
"epoch": 0.7237163814180929,
"grad_norm": 0.8171530365943909,
"learning_rate": 6.382640586797067e-06,
"loss": 0.9516,
"step": 2960
},
{
"epoch": 0.726161369193154,
"grad_norm": 0.8637788891792297,
"learning_rate": 6.3704156479217605e-06,
"loss": 0.8987,
"step": 2970
},
{
"epoch": 0.7286063569682152,
"grad_norm": 0.7505759596824646,
"learning_rate": 6.358190709046456e-06,
"loss": 0.9134,
"step": 2980
},
{
"epoch": 0.7310513447432763,
"grad_norm": 0.9182484149932861,
"learning_rate": 6.34596577017115e-06,
"loss": 0.9351,
"step": 2990
},
{
"epoch": 0.7334963325183375,
"grad_norm": 1.006665825843811,
"learning_rate": 6.333740831295844e-06,
"loss": 0.9171,
"step": 3000
},
{
"epoch": 0.7334963325183375,
"eval_loss": 0.9244844913482666,
"eval_runtime": 795.2171,
"eval_samples_per_second": 18.287,
"eval_steps_per_second": 0.572,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 8180,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.914805497032868e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
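
A minimal sketch of how the state above can be consumed. The file follows the standard Hugging Face Trainer `trainer_state.json` layout, so the per-step training records and the periodic evaluation records can be separated by whether an entry carries "loss" or "eval_loss". The local filename used below is an assumption (adjust the path to wherever this file is saved); all keys referenced come from the JSON above.

    # Sketch (assumed filename "trainer_state.json"): summarize the logged losses.
    import json

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Per-step training records carry "loss"; evaluation records carry "eval_loss".
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    print(f"best eval_loss {state['best_metric']:.4f} at step {state['best_global_step']}")
    for e in eval_logs:
        print(f"step {e['step']:>5}: eval_loss={e['eval_loss']:.4f}")
    print(f"final logged training loss (step {train_logs[-1]['step']}): {train_logs[-1]['loss']}")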