PurplelinkPL committed
Commit c8ffb41 · verified · 1 Parent(s): 40d61ef

Upload 10 files

Files changed (6)
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1485 -3
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4baee20d911bded3ac972714a9c339be4051aac75f3be17c5dd47c3bb0a04e63
+ oid sha256:8cf6fd855c268d3938b4c6d6a77aa9284b5bd03c679875c89cb3579c489295b8
  size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dffb20f09d581f1a8db94110ac7014fac958626dee3c29e960da4cb1c9f38e85
+ oid sha256:20d57d69e8d4eb0bcf9143bb2a5722964a200d83b3b1c090ed18f98299556b3a
  size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db4c787397c7bd17a5fb6bef85caf0ed539cdf41b0fe201e17b766ab049c2a38
+ oid sha256:be293c0bc96c40007a1ca95bf99da704f29c24d932c7c8e19b962a361adfdc4c
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:528826677c74cc85ac3103f0a7ddc5d791ae235096533cc53310113c112a4947
+ oid sha256:af5ee4dc438217ac40b2a125900214146b721f2725ba954be785ed61a3abe011
  size 1465
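
Note: the binary artifacts in this commit (model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt, and training_args.bin below) are stored as Git LFS pointers, so only the SHA-256 object ID changes while the reported size stays constant. The following is a minimal sketch for checking a locally downloaded file against the new pointer values; the local path is hypothetical, and the expected OID and size are copied from the model.safetensors entry above.

import hashlib

# Expected values taken from the new LFS pointer for model.safetensors (see the diff above).
EXPECTED_OID = "8cf6fd855c268d3938b4c6d6a77aa9284b5bd03c679875c89cb3579c489295b8"
EXPECTED_SIZE = 598635032

def matches_lfs_pointer(path, expected_oid, expected_size):
    """Hash the file in 1 MiB chunks and compare size and SHA-256 to the pointer."""
    sha = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
            size += len(chunk)
    return size == expected_size and sha.hexdigest() == expected_oid

# Hypothetical local path; adjust to wherever the checkpoint was downloaded.
print(matches_lfs_pointer("model.safetensors", EXPECTED_OID, EXPECTED_SIZE))
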
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.0195357196680044,
+ "epoch": 0.0055816341908584,
  "eval_steps": 1000,
- "global_step": 167000,
+ "global_step": 186000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -13041,6 +13041,1488 @@
13041
  "eval_samples_per_second": 195.51,
13042
  "eval_steps_per_second": 1.534,
13043
  "step": 167000
13044
+ },
13045
+ {
13046
+ "epoch": 0.01981480137754732,
13047
+ "grad_norm": 1.5712428092956543,
13048
+ "learning_rate": 2.8278830871985708e-05,
13049
+ "loss": 1.7747,
13050
+ "step": 167100
13051
+ },
13052
+ {
13053
+ "epoch": 0.020093883087090238,
13054
+ "grad_norm": 1.5386340618133545,
13055
+ "learning_rate": 2.8256880354422098e-05,
13056
+ "loss": 1.7738,
13057
+ "step": 167200
13058
+ },
13059
+ {
13060
+ "epoch": 0.02037296479663316,
13061
+ "grad_norm": 1.5471428632736206,
13062
+ "learning_rate": 2.8234927282417417e-05,
13063
+ "loss": 1.779,
13064
+ "step": 167300
13065
+ },
13066
+ {
13067
+ "epoch": 0.020652046506176077,
13068
+ "grad_norm": 1.5163718461990356,
13069
+ "learning_rate": 2.821297167318992e-05,
13070
+ "loss": 1.7741,
13071
+ "step": 167400
13072
+ },
13073
+ {
13074
+ "epoch": 0.020931128215719,
13075
+ "grad_norm": 1.5554001331329346,
13076
+ "learning_rate": 2.819101354395986e-05,
13077
+ "loss": 1.7825,
13078
+ "step": 167500
13079
+ },
13080
+ {
13081
+ "epoch": 0.02121020992526192,
13082
+ "grad_norm": 1.4839155673980713,
13083
+ "learning_rate": 2.8169052911949484e-05,
13084
+ "loss": 1.7729,
13085
+ "step": 167600
13086
+ },
13087
+ {
13088
+ "epoch": 0.021489291634804838,
13089
+ "grad_norm": 1.5696512460708618,
13090
+ "learning_rate": 2.8147089794382965e-05,
13091
+ "loss": 1.7754,
13092
+ "step": 167700
13093
+ },
13094
+ {
13095
+ "epoch": 0.02176837334434776,
13096
+ "grad_norm": 1.6730250120162964,
13097
+ "learning_rate": 2.8125124208486465e-05,
13098
+ "loss": 1.7736,
13099
+ "step": 167800
13100
+ },
13101
+ {
13102
+ "epoch": 0.022047455053890677,
13103
+ "grad_norm": 1.6201075315475464,
13104
+ "learning_rate": 2.810315617148806e-05,
13105
+ "loss": 1.7771,
13106
+ "step": 167900
13107
+ },
13108
+ {
13109
+ "epoch": 0.0223265367634336,
13110
+ "grad_norm": 1.6662862300872803,
13111
+ "learning_rate": 2.8081185700617746e-05,
13112
+ "loss": 1.7761,
13113
+ "step": 168000
13114
+ },
13115
+ {
13116
+ "epoch": 0.0223265367634336,
13117
+ "eval_loss": 2.161256790161133,
13118
+ "eval_runtime": 52.0964,
13119
+ "eval_samples_per_second": 195.676,
13120
+ "eval_steps_per_second": 1.536,
13121
+ "step": 168000
13122
+ },
13123
+ {
13124
+ "epoch": 0.022605618472976517,
13125
+ "grad_norm": 1.4697953462600708,
13126
+ "learning_rate": 2.8059212813107438e-05,
13127
+ "loss": 1.7894,
13128
+ "step": 168100
13129
+ },
13130
+ {
13131
+ "epoch": 0.022884700182519438,
13132
+ "grad_norm": 1.6400997638702393,
13133
+ "learning_rate": 2.803723752619094e-05,
13134
+ "loss": 1.7779,
13135
+ "step": 168200
13136
+ },
13137
+ {
13138
+ "epoch": 0.02316378189206236,
13139
+ "grad_norm": 1.5220052003860474,
13140
+ "learning_rate": 2.8015259857103942e-05,
13141
+ "loss": 1.7732,
13142
+ "step": 168300
13143
+ },
13144
+ {
13145
+ "epoch": 0.023442863601605277,
13146
+ "grad_norm": 1.542869210243225,
13147
+ "learning_rate": 2.7993279823084007e-05,
13148
+ "loss": 1.7771,
13149
+ "step": 168400
13150
+ },
13151
+ {
13152
+ "epoch": 0.0237219453111482,
13153
+ "grad_norm": 1.4953099489212036,
13154
+ "learning_rate": 2.7971297441370542e-05,
13155
+ "loss": 1.7774,
13156
+ "step": 168500
13157
+ },
13158
+ {
13159
+ "epoch": 0.024001027020691117,
13160
+ "grad_norm": 1.5665849447250366,
13161
+ "learning_rate": 2.7949312729204803e-05,
13162
+ "loss": 1.7633,
13163
+ "step": 168600
13164
+ },
13165
+ {
13166
+ "epoch": 0.024280108730234038,
13167
+ "grad_norm": 1.5860687494277954,
13168
+ "learning_rate": 2.792732570382986e-05,
13169
+ "loss": 1.7798,
13170
+ "step": 168700
13171
+ },
13172
+ {
13173
+ "epoch": 0.02455919043977696,
13174
+ "grad_norm": 1.602845311164856,
13175
+ "learning_rate": 2.790533638249062e-05,
13176
+ "loss": 1.7694,
13177
+ "step": 168800
13178
+ },
13179
+ {
13180
+ "epoch": 0.024838272149319877,
13181
+ "grad_norm": 1.5015400648117065,
13182
+ "learning_rate": 2.7883344782433774e-05,
13183
+ "loss": 1.7628,
13184
+ "step": 168900
13185
+ },
13186
+ {
13187
+ "epoch": 0.0251173538588628,
13188
+ "grad_norm": 1.5296344757080078,
13189
+ "learning_rate": 2.7861350920907807e-05,
13190
+ "loss": 1.7753,
13191
+ "step": 169000
13192
+ },
13193
+ {
13194
+ "epoch": 0.0251173538588628,
13195
+ "eval_loss": 2.1639742851257324,
13196
+ "eval_runtime": 52.1527,
13197
+ "eval_samples_per_second": 195.465,
13198
+ "eval_steps_per_second": 1.534,
13199
+ "step": 169000
13200
+ },
13201
+ {
13202
+ "epoch": 0.025396435568405717,
13203
+ "grad_norm": 1.591369390487671,
13204
+ "learning_rate": 2.783935481516297e-05,
13205
+ "loss": 1.7695,
13206
+ "step": 169100
13207
+ },
13208
+ {
13209
+ "epoch": 0.025675517277948638,
13210
+ "grad_norm": 1.5569419860839844,
13211
+ "learning_rate": 2.7817356482451297e-05,
13212
+ "loss": 1.7689,
13213
+ "step": 169200
13214
+ },
13215
+ {
13216
+ "epoch": 0.025954598987491556,
13217
+ "grad_norm": 1.6080352067947388,
13218
+ "learning_rate": 2.779535594002654e-05,
13219
+ "loss": 1.767,
13220
+ "step": 169300
13221
+ },
13222
+ {
13223
+ "epoch": 0.026233680697034478,
13224
+ "grad_norm": 1.47182035446167,
13225
+ "learning_rate": 2.77733532051442e-05,
13226
+ "loss": 1.7717,
13227
+ "step": 169400
13228
+ },
13229
+ {
13230
+ "epoch": 0.0265127624065774,
13231
+ "grad_norm": 1.6706403493881226,
13232
+ "learning_rate": 2.775134829506148e-05,
13233
+ "loss": 1.7787,
13234
+ "step": 169500
13235
+ },
13236
+ {
13237
+ "epoch": 0.026791844116120317,
13238
+ "grad_norm": 1.6530786752700806,
13239
+ "learning_rate": 2.7729341227037313e-05,
13240
+ "loss": 1.7726,
13241
+ "step": 169600
13242
+ },
13243
+ {
13244
+ "epoch": 0.02707092582566324,
13245
+ "grad_norm": 1.4457296133041382,
13246
+ "learning_rate": 2.7707332018332323e-05,
13247
+ "loss": 1.7697,
13248
+ "step": 169700
13249
+ },
13250
+ {
13251
+ "epoch": 0.027350007535206156,
13252
+ "grad_norm": 1.5824190378189087,
13253
+ "learning_rate": 2.7685320686208793e-05,
13254
+ "loss": 1.7734,
13255
+ "step": 169800
13256
+ },
13257
+ {
13258
+ "epoch": 0.027629089244749078,
13259
+ "grad_norm": 1.6177047491073608,
13260
+ "learning_rate": 2.7663307247930686e-05,
13261
+ "loss": 1.7782,
13262
+ "step": 169900
13263
+ },
13264
+ {
13265
+ "epoch": 0.027908170954292,
13266
+ "grad_norm": 1.505018949508667,
13267
+ "learning_rate": 2.7641291720763612e-05,
13268
+ "loss": 1.7659,
13269
+ "step": 170000
13270
+ },
13271
+ {
13272
+ "epoch": 0.027908170954292,
13273
+ "eval_loss": 2.1508195400238037,
13274
+ "eval_runtime": 52.1147,
13275
+ "eval_samples_per_second": 195.607,
13276
+ "eval_steps_per_second": 1.535,
13277
+ "step": 170000
13278
+ },
13279
+ {
13280
+ "epoch": 0.028187252663834917,
13281
+ "grad_norm": 1.6319383382797241,
13282
+ "learning_rate": 2.7619274121974825e-05,
13283
+ "loss": 1.7709,
13284
+ "step": 170100
13285
+ },
13286
+ {
13287
+ "epoch": 0.02846633437337784,
13288
+ "grad_norm": 1.6314260959625244,
13289
+ "learning_rate": 2.759725446883319e-05,
13290
+ "loss": 1.7675,
13291
+ "step": 170200
13292
+ },
13293
+ {
13294
+ "epoch": 0.028745416082920756,
13295
+ "grad_norm": 1.471872329711914,
13296
+ "learning_rate": 2.7575232778609206e-05,
13297
+ "loss": 1.771,
13298
+ "step": 170300
13299
+ },
13300
+ {
13301
+ "epoch": 0.029024497792463678,
13302
+ "grad_norm": 1.5450881719589233,
13303
+ "learning_rate": 2.755320906857494e-05,
13304
+ "loss": 1.836,
13305
+ "step": 170400
13306
+ },
13307
+ {
13308
+ "epoch": 0.0293035795020066,
13309
+ "grad_norm": 1.5527344942092896,
13310
+ "learning_rate": 2.753118335600408e-05,
13311
+ "loss": 1.8808,
13312
+ "step": 170500
13313
+ },
13314
+ {
13315
+ "epoch": 0.029582661211549517,
13316
+ "grad_norm": 1.8364976644515991,
13317
+ "learning_rate": 2.7509155658171852e-05,
13318
+ "loss": 1.8776,
13319
+ "step": 170600
13320
+ },
13321
+ {
13322
+ "epoch": 0.02986174292109244,
13323
+ "grad_norm": 1.4847674369812012,
13324
+ "learning_rate": 2.7487125992355058e-05,
13325
+ "loss": 1.8724,
13326
+ "step": 170700
13327
+ },
13328
+ {
13329
+ "epoch": 0.030140824630635357,
13330
+ "grad_norm": 1.5595808029174805,
13331
+ "learning_rate": 2.7465094375832028e-05,
13332
+ "loss": 1.8799,
13333
+ "step": 170800
13334
+ },
13335
+ {
13336
+ "epoch": 0.030419906340178278,
13337
+ "grad_norm": 1.54868483543396,
13338
+ "learning_rate": 2.744306082588264e-05,
13339
+ "loss": 1.8704,
13340
+ "step": 170900
13341
+ },
13342
+ {
13343
+ "epoch": 0.030698988049721196,
13344
+ "grad_norm": 1.8504784107208252,
13345
+ "learning_rate": 2.742102535978827e-05,
13346
+ "loss": 1.8736,
13347
+ "step": 171000
13348
+ },
13349
+ {
13350
+ "epoch": 0.030698988049721196,
13351
+ "eval_loss": 2.1541635990142822,
13352
+ "eval_runtime": 52.1803,
13353
+ "eval_samples_per_second": 195.361,
13354
+ "eval_steps_per_second": 1.533,
13355
+ "step": 171000
13356
+ },
13357
+ {
13358
+ "epoch": 0.030978069759264117,
13359
+ "grad_norm": 1.6168150901794434,
13360
+ "learning_rate": 2.7398987994831822e-05,
13361
+ "loss": 1.8737,
13362
+ "step": 171100
13363
+ },
13364
+ {
13365
+ "epoch": 0.03125715146880704,
13366
+ "grad_norm": 1.6291587352752686,
13367
+ "learning_rate": 2.737694874829766e-05,
13368
+ "loss": 1.8691,
13369
+ "step": 171200
13370
+ },
13371
+ {
13372
+ "epoch": 0.03153623317834996,
13373
+ "grad_norm": 1.5887749195098877,
13374
+ "learning_rate": 2.735490763747164e-05,
13375
+ "loss": 1.8725,
13376
+ "step": 171300
13377
+ },
13378
+ {
13379
+ "epoch": 0.031815314887892875,
13380
+ "grad_norm": 1.6395853757858276,
13381
+ "learning_rate": 2.733286467964108e-05,
13382
+ "loss": 1.8857,
13383
+ "step": 171400
13384
+ },
13385
+ {
13386
+ "epoch": 0.0320943965974358,
13387
+ "grad_norm": 1.5826025009155273,
13388
+ "learning_rate": 2.7310819892094742e-05,
13389
+ "loss": 1.8546,
13390
+ "step": 171500
13391
+ },
13392
+ {
13393
+ "epoch": 0.03237347830697872,
13394
+ "grad_norm": 2.160349130630493,
13395
+ "learning_rate": 2.7288773292122827e-05,
13396
+ "loss": 1.8623,
13397
+ "step": 171600
13398
+ },
13399
+ {
13400
+ "epoch": 0.032652560016521635,
13401
+ "grad_norm": 1.6130859851837158,
13402
+ "learning_rate": 2.726672489701696e-05,
13403
+ "loss": 1.8629,
13404
+ "step": 171700
13405
+ },
13406
+ {
13407
+ "epoch": 0.03293164172606456,
13408
+ "grad_norm": 1.619787335395813,
13409
+ "learning_rate": 2.7244674724070163e-05,
13410
+ "loss": 1.8646,
13411
+ "step": 171800
13412
+ },
13413
+ {
13414
+ "epoch": 0.03321072343560748,
13415
+ "grad_norm": 2.099820375442505,
13416
+ "learning_rate": 2.722262279057687e-05,
13417
+ "loss": 1.8679,
13418
+ "step": 171900
13419
+ },
13420
+ {
13421
+ "epoch": 0.033489805145150396,
13422
+ "grad_norm": 1.7083640098571777,
13423
+ "learning_rate": 2.720056911383287e-05,
13424
+ "loss": 1.8554,
13425
+ "step": 172000
13426
+ },
13427
+ {
13428
+ "epoch": 0.033489805145150396,
13429
+ "eval_loss": 2.1523571014404297,
13430
+ "eval_runtime": 52.1491,
13431
+ "eval_samples_per_second": 195.478,
13432
+ "eval_steps_per_second": 1.534,
13433
+ "step": 172000
13434
+ },
13435
+ {
13436
+ "epoch": 0.033768886854693314,
13437
+ "grad_norm": 1.5392628908157349,
13438
+ "learning_rate": 2.717851371113534e-05,
13439
+ "loss": 1.8658,
13440
+ "step": 172100
13441
+ },
13442
+ {
13443
+ "epoch": 0.03404796856423624,
13444
+ "grad_norm": 2.007720708847046,
13445
+ "learning_rate": 2.715645659978281e-05,
13446
+ "loss": 1.861,
13447
+ "step": 172200
13448
+ },
13449
+ {
13450
+ "epoch": 0.03432705027377916,
13451
+ "grad_norm": 1.566613793373108,
13452
+ "learning_rate": 2.7134397797075145e-05,
13453
+ "loss": 1.8669,
13454
+ "step": 172300
13455
+ },
13456
+ {
13457
+ "epoch": 0.034606131983322075,
13458
+ "grad_norm": 1.588408350944519,
13459
+ "learning_rate": 2.7112337320313524e-05,
13460
+ "loss": 1.8568,
13461
+ "step": 172400
13462
+ },
13463
+ {
13464
+ "epoch": 0.034885213692865,
13465
+ "grad_norm": 1.6406699419021606,
13466
+ "learning_rate": 2.7090275186800474e-05,
13467
+ "loss": 1.8713,
13468
+ "step": 172500
13469
+ },
13470
+ {
13471
+ "epoch": 0.03516429540240792,
13472
+ "grad_norm": 1.5397433042526245,
13473
+ "learning_rate": 2.7068211413839782e-05,
13474
+ "loss": 1.8629,
13475
+ "step": 172600
13476
+ },
13477
+ {
13478
+ "epoch": 0.035443377111950836,
13479
+ "grad_norm": 1.5865190029144287,
13480
+ "learning_rate": 2.704614601873654e-05,
13481
+ "loss": 1.8579,
13482
+ "step": 172700
13483
+ },
13484
+ {
13485
+ "epoch": 0.035722458821493754,
13486
+ "grad_norm": 1.7077267169952393,
13487
+ "learning_rate": 2.702407901879712e-05,
13488
+ "loss": 1.8616,
13489
+ "step": 172800
13490
+ },
13491
+ {
13492
+ "epoch": 0.03600154053103668,
13493
+ "grad_norm": 1.727586269378662,
13494
+ "learning_rate": 2.7002010431329134e-05,
13495
+ "loss": 1.8574,
13496
+ "step": 172900
13497
+ },
13498
+ {
13499
+ "epoch": 0.036280622240579596,
13500
+ "grad_norm": 1.5238264799118042,
13501
+ "learning_rate": 2.6979940273641453e-05,
13502
+ "loss": 1.8595,
13503
+ "step": 173000
13504
+ },
13505
+ {
13506
+ "epoch": 0.036280622240579596,
13507
+ "eval_loss": 2.141134738922119,
13508
+ "eval_runtime": 52.1591,
13509
+ "eval_samples_per_second": 195.441,
13510
+ "eval_steps_per_second": 1.534,
13511
+ "step": 173000
13512
+ },
13513
+ {
13514
+ "epoch": 0.036559703950122514,
13515
+ "grad_norm": 1.5688259601593018,
13516
+ "learning_rate": 2.6957868563044176e-05,
13517
+ "loss": 1.8674,
13518
+ "step": 173100
13519
+ },
13520
+ {
13521
+ "epoch": 0.03683878565966544,
13522
+ "grad_norm": 1.5195534229278564,
13523
+ "learning_rate": 2.6935795316848612e-05,
13524
+ "loss": 1.8653,
13525
+ "step": 173200
13526
+ },
13527
+ {
13528
+ "epoch": 0.03711786736920836,
13529
+ "grad_norm": 1.6201164722442627,
13530
+ "learning_rate": 2.691372055236728e-05,
13531
+ "loss": 1.8579,
13532
+ "step": 173300
13533
+ },
13534
+ {
13535
+ "epoch": 0.037396949078751275,
13536
+ "grad_norm": 1.8065686225891113,
13537
+ "learning_rate": 2.6891644286913897e-05,
13538
+ "loss": 1.8755,
13539
+ "step": 173400
13540
+ },
13541
+ {
13542
+ "epoch": 0.0376760307882942,
13543
+ "grad_norm": 1.5661702156066895,
13544
+ "learning_rate": 2.6869566537803347e-05,
13545
+ "loss": 1.8552,
13546
+ "step": 173500
13547
+ },
13548
+ {
13549
+ "epoch": 0.03795511249783712,
13550
+ "grad_norm": 1.6565943956375122,
13551
+ "learning_rate": 2.6847487322351694e-05,
13552
+ "loss": 1.8664,
13553
+ "step": 173600
13554
+ },
13555
+ {
13556
+ "epoch": 0.038234194207380036,
13557
+ "grad_norm": 1.49613356590271,
13558
+ "learning_rate": 2.6825406657876123e-05,
13559
+ "loss": 1.8524,
13560
+ "step": 173700
13561
+ },
13562
+ {
13563
+ "epoch": 0.038513275916922954,
13564
+ "grad_norm": 1.5829864740371704,
13565
+ "learning_rate": 2.6803324561694988e-05,
13566
+ "loss": 1.8732,
13567
+ "step": 173800
13568
+ },
13569
+ {
13570
+ "epoch": 0.03879235762646588,
13571
+ "grad_norm": 1.6095563173294067,
13572
+ "learning_rate": 2.6781241051127738e-05,
13573
+ "loss": 1.8503,
13574
+ "step": 173900
13575
+ },
13576
+ {
13577
+ "epoch": 0.0390714393360088,
13578
+ "grad_norm": 1.5767251253128052,
13579
+ "learning_rate": 2.675915614349495e-05,
13580
+ "loss": 1.856,
13581
+ "step": 174000
13582
+ },
13583
+ {
13584
+ "epoch": 0.0390714393360088,
13585
+ "eval_loss": 2.1416378021240234,
13586
+ "eval_runtime": 52.1112,
13587
+ "eval_samples_per_second": 195.62,
13588
+ "eval_steps_per_second": 1.535,
13589
+ "step": 174000
13590
+ },
13591
+ {
13592
+ "epoch": 0.00027908170954291995,
13593
+ "grad_norm": 1.5513286590576172,
13594
+ "learning_rate": 2.6737069856118284e-05,
13595
+ "loss": 1.7542,
13596
+ "step": 174100
13597
+ },
13598
+ {
13599
+ "epoch": 0.0005581634190858399,
13600
+ "grad_norm": 1.5664585828781128,
13601
+ "learning_rate": 2.67149822063205e-05,
13602
+ "loss": 1.7515,
13603
+ "step": 174200
13604
+ },
13605
+ {
13606
+ "epoch": 0.0008372451286287599,
13607
+ "grad_norm": 1.5423948764801025,
13608
+ "learning_rate": 2.66928932114254e-05,
13609
+ "loss": 1.7557,
13610
+ "step": 174300
13611
+ },
13612
+ {
13613
+ "epoch": 0.0011163268381716798,
13614
+ "grad_norm": 1.5535671710968018,
13615
+ "learning_rate": 2.667080288875788e-05,
13616
+ "loss": 1.7569,
13617
+ "step": 174400
13618
+ },
13619
+ {
13620
+ "epoch": 0.0013954085477146,
13621
+ "grad_norm": 1.5592520236968994,
13622
+ "learning_rate": 2.6648711255643828e-05,
13623
+ "loss": 1.7506,
13624
+ "step": 174500
13625
+ },
13626
+ {
13627
+ "epoch": 0.0016744902572575198,
13628
+ "grad_norm": 1.5440510511398315,
13629
+ "learning_rate": 2.6626618329410198e-05,
13630
+ "loss": 1.7618,
13631
+ "step": 174600
13632
+ },
13633
+ {
13634
+ "epoch": 0.00195357196680044,
13635
+ "grad_norm": 1.54314124584198,
13636
+ "learning_rate": 2.6604524127384937e-05,
13637
+ "loss": 1.7491,
13638
+ "step": 174700
13639
+ },
13640
+ {
13641
+ "epoch": 0.0022326536763433596,
13642
+ "grad_norm": 1.592208743095398,
13643
+ "learning_rate": 2.658242866689702e-05,
13644
+ "loss": 1.7458,
13645
+ "step": 174800
13646
+ },
13647
+ {
13648
+ "epoch": 0.0025117353858862797,
13649
+ "grad_norm": 1.5204849243164062,
13650
+ "learning_rate": 2.6560331965276363e-05,
13651
+ "loss": 1.7523,
13652
+ "step": 174900
13653
+ },
13654
+ {
13655
+ "epoch": 0.0027908170954292,
13656
+ "grad_norm": 1.5259612798690796,
13657
+ "learning_rate": 2.653823403985391e-05,
13658
+ "loss": 1.7535,
13659
+ "step": 175000
13660
+ },
13661
+ {
13662
+ "epoch": 0.0027908170954292,
13663
+ "eval_loss": 2.1326749324798584,
13664
+ "eval_runtime": 52.049,
13665
+ "eval_samples_per_second": 195.854,
13666
+ "eval_steps_per_second": 1.537,
13667
+ "step": 175000
13668
+ },
13669
+ {
13670
+ "epoch": 0.00306989880497212,
13671
+ "grad_norm": 1.52047598361969,
13672
+ "learning_rate": 2.651613490796152e-05,
13673
+ "loss": 1.7447,
13674
+ "step": 175100
13675
+ },
13676
+ {
13677
+ "epoch": 0.0033489805145150396,
13678
+ "grad_norm": 1.5134586095809937,
13679
+ "learning_rate": 2.6494034586932027e-05,
13680
+ "loss": 1.7452,
13681
+ "step": 175200
13682
+ },
13683
+ {
13684
+ "epoch": 0.0036280622240579597,
13685
+ "grad_norm": 1.572095513343811,
13686
+ "learning_rate": 2.6471933094099177e-05,
13687
+ "loss": 1.7571,
13688
+ "step": 175300
13689
+ },
13690
+ {
13691
+ "epoch": 0.00390714393360088,
13692
+ "grad_norm": 1.5933750867843628,
13693
+ "learning_rate": 2.6449830446797653e-05,
13694
+ "loss": 1.745,
13695
+ "step": 175400
13696
+ },
13697
+ {
13698
+ "epoch": 0.0041862256431437995,
13699
+ "grad_norm": 1.6601353883743286,
13700
+ "learning_rate": 2.6427726662363023e-05,
13701
+ "loss": 1.7462,
13702
+ "step": 175500
13703
+ },
13704
+ {
13705
+ "epoch": 0.004465307352686719,
13706
+ "grad_norm": 1.5466818809509277,
13707
+ "learning_rate": 2.640562175813177e-05,
13708
+ "loss": 1.7573,
13709
+ "step": 175600
13710
+ },
13711
+ {
13712
+ "epoch": 0.00474438906222964,
13713
+ "grad_norm": 1.5273200273513794,
13714
+ "learning_rate": 2.6383515751441234e-05,
13715
+ "loss": 1.7578,
13716
+ "step": 175700
13717
+ },
13718
+ {
13719
+ "epoch": 0.005023470771772559,
13720
+ "grad_norm": 1.609778881072998,
13721
+ "learning_rate": 2.636140865962965e-05,
13722
+ "loss": 1.7513,
13723
+ "step": 175800
13724
+ },
13725
+ {
13726
+ "epoch": 0.00530255248131548,
13727
+ "grad_norm": 1.6019160747528076,
13728
+ "learning_rate": 2.633930050003606e-05,
13729
+ "loss": 1.7557,
13730
+ "step": 175900
13731
+ },
13732
+ {
13733
+ "epoch": 0.0055816341908584,
13734
+ "grad_norm": 1.5547572374343872,
13735
+ "learning_rate": 2.6317191290000383e-05,
13736
+ "loss": 1.7645,
13737
+ "step": 176000
13738
+ },
13739
+ {
13740
+ "epoch": 0.0055816341908584,
13741
+ "eval_loss": 2.141494035720825,
13742
+ "eval_runtime": 51.4645,
13743
+ "eval_samples_per_second": 198.078,
13744
+ "eval_steps_per_second": 1.554,
13745
+ "step": 176000
13746
+ },
13747
+ {
13748
+ "epoch": 0.005860715900401319,
13749
+ "grad_norm": 1.6100679636001587,
13750
+ "learning_rate": 2.629508104686334e-05,
13751
+ "loss": 1.7566,
13752
+ "step": 176100
13753
+ },
13754
+ {
13755
+ "epoch": 0.00613979760994424,
13756
+ "grad_norm": 1.5966265201568604,
13757
+ "learning_rate": 2.6272969787966466e-05,
13758
+ "loss": 1.7511,
13759
+ "step": 176200
13760
+ },
13761
+ {
13762
+ "epoch": 0.0064188793194871595,
13763
+ "grad_norm": 1.5519967079162598,
13764
+ "learning_rate": 2.6250857530652113e-05,
13765
+ "loss": 1.7534,
13766
+ "step": 176300
13767
+ },
13768
+ {
13769
+ "epoch": 0.006697961029030079,
13770
+ "grad_norm": 1.5537617206573486,
13771
+ "learning_rate": 2.6228744292263367e-05,
13772
+ "loss": 1.7448,
13773
+ "step": 176400
13774
+ },
13775
+ {
13776
+ "epoch": 0.006977042738573,
13777
+ "grad_norm": 1.5397429466247559,
13778
+ "learning_rate": 2.6206630090144153e-05,
13779
+ "loss": 1.7456,
13780
+ "step": 176500
13781
+ },
13782
+ {
13783
+ "epoch": 0.0072561244481159195,
13784
+ "grad_norm": 1.5131994485855103,
13785
+ "learning_rate": 2.618451494163908e-05,
13786
+ "loss": 1.7472,
13787
+ "step": 176600
13788
+ },
13789
+ {
13790
+ "epoch": 0.007535206157658839,
13791
+ "grad_norm": 1.553226113319397,
13792
+ "learning_rate": 2.6162398864093553e-05,
13793
+ "loss": 1.7588,
13794
+ "step": 176700
13795
+ },
13796
+ {
13797
+ "epoch": 0.00781428786720176,
13798
+ "grad_norm": 1.5782634019851685,
13799
+ "learning_rate": 2.6140281874853666e-05,
13800
+ "loss": 1.7498,
13801
+ "step": 176800
13802
+ },
13803
+ {
13804
+ "epoch": 0.00809336957674468,
13805
+ "grad_norm": 1.5181629657745361,
13806
+ "learning_rate": 2.6118163991266275e-05,
13807
+ "loss": 1.7525,
13808
+ "step": 176900
13809
+ },
13810
+ {
13811
+ "epoch": 0.008372451286287599,
13812
+ "grad_norm": 1.622118353843689,
13813
+ "learning_rate": 2.6096045230678888e-05,
13814
+ "loss": 1.7472,
13815
+ "step": 177000
13816
+ },
13817
+ {
13818
+ "epoch": 0.008372451286287599,
13819
+ "eval_loss": 2.1567530632019043,
13820
+ "eval_runtime": 51.4987,
13821
+ "eval_samples_per_second": 197.947,
13822
+ "eval_steps_per_second": 1.553,
13823
+ "step": 177000
13824
+ },
13825
+ {
13826
+ "epoch": 0.008651532995830519,
13827
+ "grad_norm": 1.5844262838363647,
13828
+ "learning_rate": 2.6073925610439738e-05,
13829
+ "loss": 1.7489,
13830
+ "step": 177100
13831
+ },
13832
+ {
13833
+ "epoch": 0.008930614705373438,
13834
+ "grad_norm": 1.4944721460342407,
13835
+ "learning_rate": 2.6051805147897713e-05,
13836
+ "loss": 1.7535,
13837
+ "step": 177200
13838
+ },
13839
+ {
13840
+ "epoch": 0.00920969641491636,
13841
+ "grad_norm": 1.607365608215332,
13842
+ "learning_rate": 2.602968386040236e-05,
13843
+ "loss": 1.7476,
13844
+ "step": 177300
13845
+ },
13846
+ {
13847
+ "epoch": 0.00948877812445928,
13848
+ "grad_norm": 1.5790349245071411,
13849
+ "learning_rate": 2.6007561765303878e-05,
13850
+ "loss": 1.7465,
13851
+ "step": 177400
13852
+ },
13853
+ {
13854
+ "epoch": 0.0097678598340022,
13855
+ "grad_norm": 1.5833547115325928,
13856
+ "learning_rate": 2.5985438879953107e-05,
13857
+ "loss": 1.7581,
13858
+ "step": 177500
13859
+ },
13860
+ {
13861
+ "epoch": 0.010046941543545119,
13862
+ "grad_norm": 1.5244640111923218,
13863
+ "learning_rate": 2.5963315221701496e-05,
13864
+ "loss": 1.7489,
13865
+ "step": 177600
13866
+ },
13867
+ {
13868
+ "epoch": 0.010326023253088039,
13869
+ "grad_norm": 1.6332496404647827,
13870
+ "learning_rate": 2.5941190807901117e-05,
13871
+ "loss": 1.7593,
13872
+ "step": 177700
13873
+ },
13874
+ {
13875
+ "epoch": 0.01060510496263096,
13876
+ "grad_norm": 1.4967930316925049,
13877
+ "learning_rate": 2.5919065655904606e-05,
13878
+ "loss": 1.7487,
13879
+ "step": 177800
13880
+ },
13881
+ {
13882
+ "epoch": 0.01088418667217388,
13883
+ "grad_norm": 1.5874158143997192,
13884
+ "learning_rate": 2.5896939783065198e-05,
13885
+ "loss": 1.7488,
13886
+ "step": 177900
13887
+ },
13888
+ {
13889
+ "epoch": 0.0111632683817168,
13890
+ "grad_norm": 1.6334315538406372,
13891
+ "learning_rate": 2.587481320673669e-05,
13892
+ "loss": 1.7558,
13893
+ "step": 178000
13894
+ },
13895
+ {
13896
+ "epoch": 0.0111632683817168,
13897
+ "eval_loss": 2.1407663822174072,
13898
+ "eval_runtime": 51.564,
13899
+ "eval_samples_per_second": 197.696,
13900
+ "eval_steps_per_second": 1.551,
13901
+ "step": 178000
13902
+ },
13903
+ {
13904
+ "epoch": 0.011442350091259719,
13905
+ "grad_norm": 1.5070706605911255,
13906
+ "learning_rate": 2.5852685944273437e-05,
13907
+ "loss": 1.7515,
13908
+ "step": 178100
13909
+ },
13910
+ {
13911
+ "epoch": 0.011721431800802639,
13912
+ "grad_norm": 1.675197958946228,
13913
+ "learning_rate": 2.583055801303031e-05,
13914
+ "loss": 1.7517,
13915
+ "step": 178200
13916
+ },
13917
+ {
13918
+ "epoch": 0.012000513510345558,
13919
+ "grad_norm": 1.6129719018936157,
13920
+ "learning_rate": 2.5808429430362734e-05,
13921
+ "loss": 1.739,
13922
+ "step": 178300
13923
+ },
13924
+ {
13925
+ "epoch": 0.01227959521988848,
13926
+ "grad_norm": 1.6314342021942139,
13927
+ "learning_rate": 2.5786300213626623e-05,
13928
+ "loss": 1.7373,
13929
+ "step": 178400
13930
+ },
13931
+ {
13932
+ "epoch": 0.0125586769294314,
13933
+ "grad_norm": 1.4758597612380981,
13934
+ "learning_rate": 2.576417038017841e-05,
13935
+ "loss": 1.7512,
13936
+ "step": 178500
13937
+ },
13938
+ {
13939
+ "epoch": 0.012837758638974319,
13940
+ "grad_norm": 1.6322437524795532,
13941
+ "learning_rate": 2.574203994737498e-05,
13942
+ "loss": 1.7529,
13943
+ "step": 178600
13944
+ },
13945
+ {
13946
+ "epoch": 0.013116840348517239,
13947
+ "grad_norm": 1.6611186265945435,
13948
+ "learning_rate": 2.5719908932573716e-05,
13949
+ "loss": 1.7529,
13950
+ "step": 178700
13951
+ },
13952
+ {
13953
+ "epoch": 0.013395922058060158,
13954
+ "grad_norm": 1.6254630088806152,
13955
+ "learning_rate": 2.5697777353132434e-05,
13956
+ "loss": 1.7548,
13957
+ "step": 178800
13958
+ },
13959
+ {
13960
+ "epoch": 0.013675003767603078,
13961
+ "grad_norm": 1.6417994499206543,
13962
+ "learning_rate": 2.567564522640942e-05,
13963
+ "loss": 1.7501,
13964
+ "step": 178900
13965
+ },
13966
+ {
13967
+ "epoch": 0.013954085477146,
13968
+ "grad_norm": 1.5359156131744385,
13969
+ "learning_rate": 2.5653512569763377e-05,
13970
+ "loss": 1.7562,
13971
+ "step": 179000
13972
+ },
13973
+ {
13974
+ "epoch": 0.013954085477146,
13975
+ "eval_loss": 2.144591808319092,
13976
+ "eval_runtime": 51.5364,
13977
+ "eval_samples_per_second": 197.802,
13978
+ "eval_steps_per_second": 1.552,
13979
+ "step": 179000
13980
+ },
13981
+ {
13982
+ "epoch": 0.01423316718668892,
13983
+ "grad_norm": 1.5880595445632935,
13984
+ "learning_rate": 2.5631379400553416e-05,
13985
+ "loss": 1.75,
13986
+ "step": 179100
13987
+ },
13988
+ {
13989
+ "epoch": 0.014512248896231839,
13990
+ "grad_norm": 1.6134679317474365,
13991
+ "learning_rate": 2.560924573613906e-05,
13992
+ "loss": 1.7508,
13993
+ "step": 179200
13994
+ },
13995
+ {
13996
+ "epoch": 0.014791330605774759,
13997
+ "grad_norm": 1.5464352369308472,
13998
+ "learning_rate": 2.5587111593880205e-05,
13999
+ "loss": 1.7502,
14000
+ "step": 179300
14001
+ },
14002
+ {
14003
+ "epoch": 0.015070412315317678,
14004
+ "grad_norm": 1.573649525642395,
14005
+ "learning_rate": 2.556497699113714e-05,
14006
+ "loss": 1.7435,
14007
+ "step": 179400
14008
+ },
14009
+ {
14010
+ "epoch": 0.015349494024860598,
14011
+ "grad_norm": 1.5665711164474487,
14012
+ "learning_rate": 2.554284194527051e-05,
14013
+ "loss": 1.7462,
14014
+ "step": 179500
14015
+ },
14016
+ {
14017
+ "epoch": 0.01562857573440352,
14018
+ "grad_norm": 1.606072187423706,
14019
+ "learning_rate": 2.5520706473641316e-05,
14020
+ "loss": 1.7516,
14021
+ "step": 179600
14022
+ },
14023
+ {
14024
+ "epoch": 0.015907657443946437,
14025
+ "grad_norm": 1.5898959636688232,
14026
+ "learning_rate": 2.549857059361086e-05,
14027
+ "loss": 1.7482,
14028
+ "step": 179700
14029
+ },
14030
+ {
14031
+ "epoch": 0.01618673915348936,
14032
+ "grad_norm": 1.6288598775863647,
14033
+ "learning_rate": 2.547643432254081e-05,
14034
+ "loss": 1.7365,
14035
+ "step": 179800
14036
+ },
14037
+ {
14038
+ "epoch": 0.01646582086303228,
14039
+ "grad_norm": 1.5765552520751953,
14040
+ "learning_rate": 2.545429767779311e-05,
14041
+ "loss": 1.7346,
14042
+ "step": 179900
14043
+ },
14044
+ {
14045
+ "epoch": 0.016744902572575198,
14046
+ "grad_norm": 1.5909677743911743,
14047
+ "learning_rate": 2.5432160676729994e-05,
14048
+ "loss": 1.7493,
14049
+ "step": 180000
14050
+ },
14051
+ {
14052
+ "epoch": 0.016744902572575198,
14053
+ "eval_loss": 2.1469063758850098,
14054
+ "eval_runtime": 52.5101,
14055
+ "eval_samples_per_second": 194.134,
14056
+ "eval_steps_per_second": 1.524,
14057
+ "step": 180000
14058
+ },
14059
+ {
14060
+ "epoch": 0.01702398428211812,
14061
+ "grad_norm": 1.6108888387680054,
14062
+ "learning_rate": 2.5410023336713996e-05,
14063
+ "loss": 1.749,
14064
+ "step": 180100
14065
+ },
14066
+ {
14067
+ "epoch": 0.017303065991661037,
14068
+ "grad_norm": 1.5427972078323364,
14069
+ "learning_rate": 2.538788567510791e-05,
14070
+ "loss": 1.738,
14071
+ "step": 180200
14072
+ },
14073
+ {
14074
+ "epoch": 0.01758214770120396,
14075
+ "grad_norm": 1.5925029516220093,
14076
+ "learning_rate": 2.5365747709274767e-05,
14077
+ "loss": 1.7418,
14078
+ "step": 180300
14079
+ },
14080
+ {
14081
+ "epoch": 0.017861229410746877,
14082
+ "grad_norm": 1.5784283876419067,
14083
+ "learning_rate": 2.5343609456577867e-05,
14084
+ "loss": 1.7417,
14085
+ "step": 180400
14086
+ },
14087
+ {
14088
+ "epoch": 0.018140311120289798,
14089
+ "grad_norm": 1.623561978340149,
14090
+ "learning_rate": 2.53214709343807e-05,
14091
+ "loss": 1.7443,
14092
+ "step": 180500
14093
+ },
14094
+ {
14095
+ "epoch": 0.01841939282983272,
14096
+ "grad_norm": 1.6505674123764038,
14097
+ "learning_rate": 2.5299332160046985e-05,
14098
+ "loss": 1.7454,
14099
+ "step": 180600
14100
+ },
14101
+ {
14102
+ "epoch": 0.018698474539375638,
14103
+ "grad_norm": 1.5555040836334229,
14104
+ "learning_rate": 2.5277193150940638e-05,
14105
+ "loss": 1.7416,
14106
+ "step": 180700
14107
+ },
14108
+ {
14109
+ "epoch": 0.01897755624891856,
14110
+ "grad_norm": 1.6162723302841187,
14111
+ "learning_rate": 2.525505392442577e-05,
14112
+ "loss": 1.7433,
14113
+ "step": 180800
14114
+ },
14115
+ {
14116
+ "epoch": 0.019256637958461477,
14117
+ "grad_norm": 1.5440572500228882,
14118
+ "learning_rate": 2.523291449786663e-05,
14119
+ "loss": 1.7438,
14120
+ "step": 180900
14121
+ },
14122
+ {
14123
+ "epoch": 0.0195357196680044,
14124
+ "grad_norm": 1.596146583557129,
14125
+ "learning_rate": 2.5210774888627664e-05,
14126
+ "loss": 1.7425,
14127
+ "step": 181000
14128
+ },
14129
+ {
14130
+ "epoch": 0.0195357196680044,
14131
+ "eval_loss": 2.140672206878662,
14132
+ "eval_runtime": 51.8004,
14133
+ "eval_samples_per_second": 196.794,
14134
+ "eval_steps_per_second": 1.544,
14135
+ "step": 181000
14136
+ },
14137
+ {
14138
+ "epoch": 0.01981480137754732,
14139
+ "grad_norm": 1.6086748838424683,
14140
+ "learning_rate": 2.5188635114073434e-05,
14141
+ "loss": 1.7488,
14142
+ "step": 181100
14143
+ },
14144
+ {
14145
+ "epoch": 0.020093883087090238,
14146
+ "grad_norm": 1.564663290977478,
14147
+ "learning_rate": 2.516649519156864e-05,
14148
+ "loss": 1.7452,
14149
+ "step": 181200
14150
+ },
14151
+ {
14152
+ "epoch": 0.02037296479663316,
14153
+ "grad_norm": 1.5975944995880127,
14154
+ "learning_rate": 2.51443551384781e-05,
14155
+ "loss": 1.7419,
14156
+ "step": 181300
14157
+ },
14158
+ {
14159
+ "epoch": 0.020652046506176077,
14160
+ "grad_norm": 1.6056960821151733,
14161
+ "learning_rate": 2.5122214972166724e-05,
14162
+ "loss": 1.7536,
14163
+ "step": 181400
14164
+ },
14165
+ {
14166
+ "epoch": 0.020931128215719,
14167
+ "grad_norm": 1.6348010301589966,
14168
+ "learning_rate": 2.5100074709999526e-05,
14169
+ "loss": 1.7505,
14170
+ "step": 181500
14171
+ },
14172
+ {
14173
+ "epoch": 0.02121020992526192,
14174
+ "grad_norm": 1.4651880264282227,
14175
+ "learning_rate": 2.5077934369341594e-05,
14176
+ "loss": 1.7474,
14177
+ "step": 181600
14178
+ },
14179
+ {
14180
+ "epoch": 0.021489291634804838,
14181
+ "grad_norm": 1.6000345945358276,
14182
+ "learning_rate": 2.505579396755806e-05,
14183
+ "loss": 1.7455,
14184
+ "step": 181700
14185
+ },
14186
+ {
14187
+ "epoch": 0.02176837334434776,
14188
+ "grad_norm": 1.6549137830734253,
14189
+ "learning_rate": 2.503365352201413e-05,
14190
+ "loss": 1.7404,
14191
+ "step": 181800
14192
+ },
14193
+ {
14194
+ "epoch": 0.022047455053890677,
14195
+ "grad_norm": 1.6172484159469604,
14196
+ "learning_rate": 2.5011513050075014e-05,
14197
+ "loss": 1.7457,
14198
+ "step": 181900
14199
+ },
14200
+ {
14201
+ "epoch": 0.0223265367634336,
14202
+ "grad_norm": 1.6283797025680542,
14203
+ "learning_rate": 2.4989372569105962e-05,
14204
+ "loss": 1.7411,
14205
+ "step": 182000
14206
+ },
14207
+ {
14208
+ "epoch": 0.0223265367634336,
14209
+ "eval_loss": 2.1432528495788574,
14210
+ "eval_runtime": 51.7742,
14211
+ "eval_samples_per_second": 196.894,
14212
+ "eval_steps_per_second": 1.545,
14213
+ "step": 182000
14214
+ },
14215
+ {
14216
+ "epoch": 0.022605618472976517,
14217
+ "grad_norm": 1.5319279432296753,
14218
+ "learning_rate": 2.4967232096472236e-05,
14219
+ "loss": 1.76,
14220
+ "step": 182100
14221
+ },
14222
+ {
14223
+ "epoch": 0.022884700182519438,
14224
+ "grad_norm": 1.600860595703125,
14225
+ "learning_rate": 2.4945091649539086e-05,
14226
+ "loss": 1.7416,
14227
+ "step": 182200
14228
+ },
14229
+ {
14230
+ "epoch": 0.02316378189206236,
14231
+ "grad_norm": 1.5592856407165527,
14232
+ "learning_rate": 2.4922951245671723e-05,
14233
+ "loss": 1.7421,
14234
+ "step": 182300
14235
+ },
14236
+ {
14237
+ "epoch": 0.023442863601605277,
14238
+ "grad_norm": 1.5361909866333008,
14239
+ "learning_rate": 2.4900810902235356e-05,
14240
+ "loss": 1.7436,
14241
+ "step": 182400
14242
+ },
14243
+ {
14244
+ "epoch": 0.0237219453111482,
14245
+ "grad_norm": 1.526672124862671,
14246
+ "learning_rate": 2.4878670636595117e-05,
14247
+ "loss": 1.7418,
14248
+ "step": 182500
14249
+ },
14250
+ {
14251
+ "epoch": 0.024001027020691117,
14252
+ "grad_norm": 1.5167595148086548,
14253
+ "learning_rate": 2.4856530466116112e-05,
14254
+ "loss": 1.7389,
14255
+ "step": 182600
14256
+ },
14257
+ {
14258
+ "epoch": 0.024280108730234038,
14259
+ "grad_norm": 1.6046936511993408,
14260
+ "learning_rate": 2.4834390408163324e-05,
14261
+ "loss": 1.7459,
14262
+ "step": 182700
14263
+ },
14264
+ {
14265
+ "epoch": 0.02455919043977696,
14266
+ "grad_norm": 1.572601079940796,
14267
+ "learning_rate": 2.4812250480101693e-05,
14268
+ "loss": 1.7464,
14269
+ "step": 182800
14270
+ },
14271
+ {
14272
+ "epoch": 0.024838272149319877,
14273
+ "grad_norm": 1.5549017190933228,
14274
+ "learning_rate": 2.479011069929603e-05,
14275
+ "loss": 1.7356,
14276
+ "step": 182900
14277
+ },
14278
+ {
14279
+ "epoch": 0.0251173538588628,
14280
+ "grad_norm": 1.5163230895996094,
14281
+ "learning_rate": 2.476797108311106e-05,
14282
+ "loss": 1.7427,
14283
+ "step": 183000
14284
+ },
14285
+ {
14286
+ "epoch": 0.0251173538588628,
14287
+ "eval_loss": 2.1313729286193848,
14288
+ "eval_runtime": 51.744,
14289
+ "eval_samples_per_second": 197.009,
14290
+ "eval_steps_per_second": 1.546,
14291
+ "step": 183000
14292
+ },
14293
+ {
14294
+ "epoch": 0.025396435568405717,
14295
+ "grad_norm": 1.5936397314071655,
14296
+ "learning_rate": 2.474583164891133e-05,
14297
+ "loss": 1.7446,
14298
+ "step": 183100
14299
+ },
14300
+ {
14301
+ "epoch": 0.025675517277948638,
14302
+ "grad_norm": 1.5533971786499023,
14303
+ "learning_rate": 2.4723692414061295e-05,
14304
+ "loss": 1.7452,
14305
+ "step": 183200
14306
+ },
14307
+ {
14308
+ "epoch": 0.025954598987491556,
14309
+ "grad_norm": 1.6152623891830444,
14310
+ "learning_rate": 2.4701553395925214e-05,
14311
+ "loss": 1.7425,
14312
+ "step": 183300
14313
+ },
14314
+ {
14315
+ "epoch": 0.026233680697034478,
14316
+ "grad_norm": 1.4908332824707031,
14317
+ "learning_rate": 2.4679414611867214e-05,
14318
+ "loss": 1.755,
14319
+ "step": 183400
14320
+ },
14321
+ {
14322
+ "epoch": 0.0265127624065774,
14323
+ "grad_norm": 1.6560674905776978,
14324
+ "learning_rate": 2.4657276079251194e-05,
14325
+ "loss": 1.7477,
14326
+ "step": 183500
14327
+ },
14328
+ {
14329
+ "epoch": 0.026791844116120317,
14330
+ "grad_norm": 1.7160277366638184,
14331
+ "learning_rate": 2.4635137815440894e-05,
14332
+ "loss": 1.7446,
14333
+ "step": 183600
14334
+ },
14335
+ {
14336
+ "epoch": 0.02707092582566324,
14337
+ "grad_norm": 1.4447243213653564,
14338
+ "learning_rate": 2.461299983779983e-05,
14339
+ "loss": 1.7403,
14340
+ "step": 183700
14341
+ },
14342
+ {
14343
+ "epoch": 0.027350007535206156,
14344
+ "grad_norm": 1.605068325996399,
14345
+ "learning_rate": 2.459086216369129e-05,
14346
+ "loss": 1.7439,
14347
+ "step": 183800
14348
+ },
14349
+ {
14350
+ "epoch": 0.027629089244749078,
14351
+ "grad_norm": 1.6601132154464722,
14352
+ "learning_rate": 2.4568724810478325e-05,
14353
+ "loss": 1.7439,
14354
+ "step": 183900
14355
+ },
14356
+ {
14357
+ "epoch": 0.027908170954292,
14358
+ "grad_norm": 1.546660304069519,
14359
+ "learning_rate": 2.4546587795523733e-05,
14360
+ "loss": 1.7339,
14361
+ "step": 184000
14362
+ },
14363
+ {
14364
+ "epoch": 0.027908170954292,
14365
+ "eval_loss": 2.1373305320739746,
14366
+ "eval_runtime": 51.7742,
14367
+ "eval_samples_per_second": 196.893,
14368
+ "eval_steps_per_second": 1.545,
14369
+ "step": 184000
14370
+ },
14371
+ {
14372
+ "epoch": 0.00027908170954291995,
14373
+ "grad_norm": 1.6656001806259155,
14374
+ "learning_rate": 2.4524451136190048e-05,
14375
+ "loss": 1.8435,
14376
+ "step": 184100
14377
+ },
14378
+ {
14379
+ "epoch": 0.0005581634190858399,
14380
+ "grad_norm": 1.6392732858657837,
14381
+ "learning_rate": 2.4502314849839546e-05,
14382
+ "loss": 1.8453,
14383
+ "step": 184200
14384
+ },
14385
+ {
14386
+ "epoch": 0.0008372451286287599,
14387
+ "grad_norm": 1.7409366369247437,
14388
+ "learning_rate": 2.4480178953834162e-05,
14389
+ "loss": 1.8407,
14390
+ "step": 184300
14391
+ },
14392
+ {
14393
+ "epoch": 0.0011163268381716798,
14394
+ "grad_norm": 1.5873730182647705,
14395
+ "learning_rate": 2.445804346553557e-05,
14396
+ "loss": 1.8428,
14397
+ "step": 184400
14398
+ },
14399
+ {
14400
+ "epoch": 0.0013954085477146,
14401
+ "grad_norm": 1.5073753595352173,
14402
+ "learning_rate": 2.4435908402305108e-05,
14403
+ "loss": 1.8379,
14404
+ "step": 184500
14405
+ },
14406
+ {
14407
+ "epoch": 0.0016744902572575198,
14408
+ "grad_norm": 2.3680567741394043,
14409
+ "learning_rate": 2.4413773781503788e-05,
14410
+ "loss": 1.83,
14411
+ "step": 184600
14412
+ },
14413
+ {
14414
+ "epoch": 0.00195357196680044,
14415
+ "grad_norm": 1.6823689937591553,
14416
+ "learning_rate": 2.4391639620492243e-05,
14417
+ "loss": 1.8411,
14418
+ "step": 184700
14419
+ },
14420
+ {
14421
+ "epoch": 0.0022326536763433596,
14422
+ "grad_norm": 1.5574064254760742,
14423
+ "learning_rate": 2.4369505936630786e-05,
14424
+ "loss": 1.8351,
14425
+ "step": 184800
14426
+ },
14427
+ {
14428
+ "epoch": 0.0025117353858862797,
14429
+ "grad_norm": 2.146876096725464,
14430
+ "learning_rate": 2.4347372747279337e-05,
14431
+ "loss": 1.833,
14432
+ "step": 184900
14433
+ },
14434
+ {
14435
+ "epoch": 0.0027908170954292,
14436
+ "grad_norm": 1.6746612787246704,
14437
+ "learning_rate": 2.4325240069797438e-05,
14438
+ "loss": 1.8284,
14439
+ "step": 185000
14440
+ },
14441
+ {
14442
+ "epoch": 0.0027908170954292,
14443
+ "eval_loss": 2.133864641189575,
14444
+ "eval_runtime": 52.0009,
14445
+ "eval_samples_per_second": 196.035,
14446
+ "eval_steps_per_second": 1.538,
14447
+ "step": 185000
14448
+ },
14449
+ {
14450
+ "epoch": 0.00306989880497212,
14451
+ "grad_norm": 1.6454411745071411,
14452
+ "learning_rate": 2.430310792154422e-05,
14453
+ "loss": 1.8312,
14454
+ "step": 185100
14455
+ },
14456
+ {
14457
+ "epoch": 0.0033489805145150396,
14458
+ "grad_norm": 1.8907885551452637,
14459
+ "learning_rate": 2.4280976319878392e-05,
14460
+ "loss": 1.8384,
14461
+ "step": 185200
14462
+ },
14463
+ {
14464
+ "epoch": 0.0036280622240579597,
14465
+ "grad_norm": 1.6488444805145264,
14466
+ "learning_rate": 2.425884528215825e-05,
14467
+ "loss": 1.8241,
14468
+ "step": 185300
14469
+ },
14470
+ {
14471
+ "epoch": 0.00390714393360088,
14472
+ "grad_norm": 1.6460552215576172,
14473
+ "learning_rate": 2.423671482574164e-05,
14474
+ "loss": 1.8318,
14475
+ "step": 185400
14476
+ },
14477
+ {
14478
+ "epoch": 0.0041862256431437995,
14479
+ "grad_norm": 1.6229537725448608,
14480
+ "learning_rate": 2.4214584967985962e-05,
14481
+ "loss": 1.8349,
14482
+ "step": 185500
14483
+ },
14484
+ {
14485
+ "epoch": 0.004465307352686719,
14486
+ "grad_norm": 1.5805400609970093,
14487
+ "learning_rate": 2.419245572624812e-05,
14488
+ "loss": 1.823,
14489
+ "step": 185600
14490
+ },
14491
+ {
14492
+ "epoch": 0.00474438906222964,
14493
+ "grad_norm": 1.8274881839752197,
14494
+ "learning_rate": 2.4170327117884562e-05,
14495
+ "loss": 1.8363,
14496
+ "step": 185700
14497
+ },
14498
+ {
14499
+ "epoch": 0.005023470771772559,
14500
+ "grad_norm": 1.5922763347625732,
14501
+ "learning_rate": 2.4148199160251238e-05,
14502
+ "loss": 1.8272,
14503
+ "step": 185800
14504
+ },
14505
+ {
14506
+ "epoch": 0.00530255248131548,
14507
+ "grad_norm": 1.6500530242919922,
14508
+ "learning_rate": 2.4126071870703574e-05,
14509
+ "loss": 1.821,
14510
+ "step": 185900
14511
+ },
14512
+ {
14513
+ "epoch": 0.0055816341908584,
14514
+ "grad_norm": 1.6244685649871826,
14515
+ "learning_rate": 2.410394526659647e-05,
14516
+ "loss": 1.8287,
14517
+ "step": 186000
14518
+ },
14519
+ {
14520
+ "epoch": 0.0055816341908584,
14521
+ "eval_loss": 2.131998300552368,
14522
+ "eval_runtime": 51.5187,
14523
+ "eval_samples_per_second": 197.87,
14524
+ "eval_steps_per_second": 1.553,
14525
+ "step": 186000
14526
  }
14527
  ],
14528
  "logging_steps": 100,
 
@@ -13060,7 +14542,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.4574492479127552e+19,
+ "total_flos": 1.6232668270166016e+19,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9318402efc23f8b2e09dec877ba7b88863d76a00aceeef7c22f944e9f6a43e28
+ oid sha256:6be0aaef9589a43e4cde380bc3e83ccd55ea3b262dc3f11f0bbc4b35fc934376
  size 5777
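
Note: the trainer_state.json change above mostly appends new log entries (steps 167100 through 186000) carrying loss, grad_norm, learning_rate, and periodic eval_loss values, along with updated epoch, global_step, and total_flos fields. The following is a small sketch for pulling the evaluation curve out of the updated file; it assumes the entries shown in the diff sit in the standard log_history array of a Hugging Face Trainer state file and that the file has been downloaded locally.

import json

# Hypothetical local path to the trainer_state.json from this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Assumption: the logged entries shown in the diff are stored under "log_history".
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state.get("log_history", [])
    if "eval_loss" in entry
]

for step, loss in eval_points:
    print(f"step {step}: eval_loss {loss:.4f}")
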