Upload 10 files

Browse files

Files changed (6) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +2187 -3
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8cf6fd855c268d3938b4c6d6a77aa9284b5bd03c679875c89cb3579c489295b8
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:60ccf0628b701b3fbdbd8e47c124929d09ca765f44e1db4de84ca146c4892cb2
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20d57d69e8d4eb0bcf9143bb2a5722964a200d83b3b1c090ed18f98299556b3a
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:90969ce2677fe59ebce6103f3db23c468384c1c32a2de10256b3b5076385d4ff
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:be293c0bc96c40007a1ca95bf99da704f29c24d932c7c8e19b962a361adfdc4c
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:b19a9b53a8ffcdf83e2c27bdb7c9e264673baa2e50d42027e774b79d1973943e
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af5ee4dc438217ac40b2a125900214146b721f2725ba954be785ed61a3abe011
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:8ca7233d8acabb4ee394de5e172d0b6096e38585b946640bcf133642f5f83579
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0055816341908584,
   "eval_steps": 1000,
-  "global_step": 186000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14523,6 +14523,2190 @@
       "eval_samples_per_second": 197.87,
       "eval_steps_per_second": 1.553,
       "step": 186000
     }
   ],
   "logging_steps": 100,
@@ -14542,7 +16726,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.6232668270166016e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.041862256431438,
   "eval_steps": 1000,
+  "global_step": 214000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 197.87,
       "eval_steps_per_second": 1.553,
       "step": 186000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 1.608132004737854,
+      "learning_rate": 2.40818193652843e-05,
+      "loss": 1.837,
+      "step": 186100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 1.5261002779006958,
+      "learning_rate": 2.4059694184120883e-05,
+      "loss": 1.827,
+      "step": 186200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 1.604973316192627,
+      "learning_rate": 2.4037569740459486e-05,
+      "loss": 1.8157,
+      "step": 186300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 1.6349529027938843,
+      "learning_rate": 2.401544605165276e-05,
+      "loss": 1.8381,
+      "step": 186400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 1.5540446043014526,
+      "learning_rate": 2.3993323135052806e-05,
+      "loss": 1.8383,
+      "step": 186500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 1.6200664043426514,
+      "learning_rate": 2.3971201008011093e-05,
+      "loss": 1.828,
+      "step": 186600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 1.750746726989746,
+      "learning_rate": 2.3949079687878492e-05,
+      "loss": 1.8302,
+      "step": 186700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 1.6309112310409546,
+      "learning_rate": 2.392695919200521e-05,
+      "loss": 1.8118,
+      "step": 186800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 1.5920358896255493,
+      "learning_rate": 2.3904839537740837e-05,
+      "loss": 1.8226,
+      "step": 186900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 1.7713048458099365,
+      "learning_rate": 2.3882720742434294e-05,
+      "loss": 1.8197,
+      "step": 187000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.121570348739624,
+      "eval_runtime": 51.4105,
+      "eval_samples_per_second": 198.286,
+      "eval_steps_per_second": 1.556,
+      "step": 187000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 1.674100637435913,
+      "learning_rate": 2.3860602823433825e-05,
+      "loss": 1.8338,
+      "step": 187100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 1.6260745525360107,
+      "learning_rate": 2.3838485798086984e-05,
+      "loss": 1.8209,
+      "step": 187200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 1.786022663116455,
+      "learning_rate": 2.3816369683740624e-05,
+      "loss": 1.8298,
+      "step": 187300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 1.521037220954895,
+      "learning_rate": 2.3794254497740898e-05,
+      "loss": 1.8353,
+      "step": 187400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 1.5519471168518066,
+      "learning_rate": 2.3772140257433223e-05,
+      "loss": 1.8361,
+      "step": 187500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 1.5187164545059204,
+      "learning_rate": 2.3750026980162256e-05,
+      "loss": 1.8326,
+      "step": 187600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 1.7430784702301025,
+      "learning_rate": 2.3727914683271922e-05,
+      "loss": 1.8308,
+      "step": 187700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 1.6210083961486816,
+      "learning_rate": 2.3705803384105377e-05,
+      "loss": 1.8252,
+      "step": 187800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 1.6390823125839233,
+      "learning_rate": 2.3683693100004985e-05,
+      "loss": 1.8287,
+      "step": 187900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 2.0330820083618164,
+      "learning_rate": 2.3661583848312303e-05,
+      "loss": 1.8347,
+      "step": 188000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.131164073944092,
+      "eval_runtime": 51.4325,
+      "eval_samples_per_second": 198.202,
+      "eval_steps_per_second": 1.555,
+      "step": 188000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 1.5582841634750366,
+      "learning_rate": 2.36394756463681e-05,
+      "loss": 1.8215,
+      "step": 188100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 1.5832375288009644,
+      "learning_rate": 2.361736851151231e-05,
+      "loss": 1.8316,
+      "step": 188200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 1.578747272491455,
+      "learning_rate": 2.359526246108404e-05,
+      "loss": 1.828,
+      "step": 188300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 1.6343365907669067,
+      "learning_rate": 2.3573157512421535e-05,
+      "loss": 1.8348,
+      "step": 188400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 1.5738635063171387,
+      "learning_rate": 2.3551053682862177e-05,
+      "loss": 1.8271,
+      "step": 188500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 1.6531946659088135,
+      "learning_rate": 2.3528950989742472e-05,
+      "loss": 1.8168,
+      "step": 188600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 2.098233699798584,
+      "learning_rate": 2.350684945039804e-05,
+      "loss": 1.8323,
+      "step": 188700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 1.6470394134521484,
+      "learning_rate": 2.3484749082163605e-05,
+      "loss": 1.8353,
+      "step": 188800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 1.6183503866195679,
+      "learning_rate": 2.346264990237293e-05,
+      "loss": 1.8204,
+      "step": 188900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 1.60996675491333,
+      "learning_rate": 2.3440551928358894e-05,
+      "loss": 1.8291,
+      "step": 189000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.130070924758911,
+      "eval_runtime": 51.3411,
+      "eval_samples_per_second": 198.554,
+      "eval_steps_per_second": 1.558,
+      "step": 189000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 1.5722655057907104,
+      "learning_rate": 2.3418455177453416e-05,
+      "loss": 1.8258,
+      "step": 189100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 2.121628999710083,
+      "learning_rate": 2.339635966698745e-05,
+      "loss": 1.8324,
+      "step": 189200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 1.6077678203582764,
+      "learning_rate": 2.3374265414290962e-05,
+      "loss": 1.8243,
+      "step": 189300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 1.5904488563537598,
+      "learning_rate": 2.335217243669296e-05,
+      "loss": 1.825,
+      "step": 189400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 1.536439061164856,
+      "learning_rate": 2.333008075152144e-05,
+      "loss": 1.8242,
+      "step": 189500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 2.195769786834717,
+      "learning_rate": 2.3307990376103388e-05,
+      "loss": 1.8365,
+      "step": 189600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 1.533521294593811,
+      "learning_rate": 2.328590132776475e-05,
+      "loss": 1.8266,
+      "step": 189700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 1.5849336385726929,
+      "learning_rate": 2.326381362383045e-05,
+      "loss": 1.8206,
+      "step": 189800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 1.5556162595748901,
+      "learning_rate": 2.3241727281624335e-05,
+      "loss": 1.8272,
+      "step": 189900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 1.6486213207244873,
+      "learning_rate": 2.3219642318469215e-05,
+      "loss": 1.8333,
+      "step": 190000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.1350369453430176,
+      "eval_runtime": 51.4386,
+      "eval_samples_per_second": 198.178,
+      "eval_steps_per_second": 1.555,
+      "step": 190000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 1.6402443647384644,
+      "learning_rate": 2.3197558751686776e-05,
+      "loss": 1.83,
+      "step": 190100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 1.5592520236968994,
+      "learning_rate": 2.3175476598597648e-05,
+      "loss": 1.8244,
+      "step": 190200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 2.0347630977630615,
+      "learning_rate": 2.3153395876521336e-05,
+      "loss": 1.8385,
+      "step": 190300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 1.547045350074768,
+      "learning_rate": 2.3131316602776232e-05,
+      "loss": 1.8216,
+      "step": 190400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 1.564841628074646,
+      "learning_rate": 2.3109238794679568e-05,
+      "loss": 1.8232,
+      "step": 190500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 1.8858461380004883,
+      "learning_rate": 2.3087162469547443e-05,
+      "loss": 1.8319,
+      "step": 190600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 1.7047299146652222,
+      "learning_rate": 2.30650876446948e-05,
+      "loss": 1.8391,
+      "step": 190700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 1.510563850402832,
+      "learning_rate": 2.30430143374354e-05,
+      "loss": 1.8226,
+      "step": 190800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 2.209728956222534,
+      "learning_rate": 2.3020942565081798e-05,
+      "loss": 1.8307,
+      "step": 190900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 1.6156638860702515,
+      "learning_rate": 2.299887234494537e-05,
+      "loss": 1.8208,
+      "step": 191000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.121595621109009,
+      "eval_runtime": 51.6838,
+      "eval_samples_per_second": 197.238,
+      "eval_steps_per_second": 1.548,
+      "step": 191000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 1.5259544849395752,
+      "learning_rate": 2.2976803694336256e-05,
+      "loss": 1.8279,
+      "step": 191100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 1.6435580253601074,
+      "learning_rate": 2.2954736630563375e-05,
+      "loss": 1.8291,
+      "step": 191200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 1.6680907011032104,
+      "learning_rate": 2.2932671170934405e-05,
+      "loss": 1.834,
+      "step": 191300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 1.6637004613876343,
+      "learning_rate": 2.2910607332755744e-05,
+      "loss": 1.8067,
+      "step": 191400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 1.5594576597213745,
+      "learning_rate": 2.288854513333254e-05,
+      "loss": 1.8132,
+      "step": 191500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 1.502920389175415,
+      "learning_rate": 2.2866484589968654e-05,
+      "loss": 1.8337,
+      "step": 191600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 1.566256046295166,
+      "learning_rate": 2.2844425719966637e-05,
+      "loss": 1.8216,
+      "step": 191700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 1.882520079612732,
+      "learning_rate": 2.2822368540627736e-05,
+      "loss": 1.8178,
+      "step": 191800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 1.5686990022659302,
+      "learning_rate": 2.2800313069251867e-05,
+      "loss": 1.831,
+      "step": 191900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 1.6161882877349854,
+      "learning_rate": 2.2778259323137607e-05,
+      "loss": 1.8236,
+      "step": 192000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.1250271797180176,
+      "eval_runtime": 51.7856,
+      "eval_samples_per_second": 196.85,
+      "eval_steps_per_second": 1.545,
+      "step": 192000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 1.9454728364944458,
+      "learning_rate": 2.27562073195822e-05,
+      "loss": 1.8262,
+      "step": 192100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 1.568524956703186,
+      "learning_rate": 2.273415707588148e-05,
+      "loss": 1.8111,
+      "step": 192200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 1.6108800172805786,
+      "learning_rate": 2.2712108609329933e-05,
+      "loss": 1.8097,
+      "step": 192300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 1.5785143375396729,
+      "learning_rate": 2.2690061937220656e-05,
+      "loss": 1.8223,
+      "step": 192400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 2.498911142349243,
+      "learning_rate": 2.2668017076845323e-05,
+      "loss": 2.0084,
+      "step": 192500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 2.186514377593994,
+      "learning_rate": 2.2645974045494175e-05,
+      "loss": 2.48,
+      "step": 192600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 2.3486995697021484,
+      "learning_rate": 2.2623932860456044e-05,
+      "loss": 2.4545,
+      "step": 192700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 2.1500723361968994,
+      "learning_rate": 2.2601893539018305e-05,
+      "loss": 2.4442,
+      "step": 192800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 2.1858279705047607,
+      "learning_rate": 2.2579856098466882e-05,
+      "loss": 2.4291,
+      "step": 192900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 2.4530797004699707,
+      "learning_rate": 2.2557820556086187e-05,
+      "loss": 2.4252,
+      "step": 193000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.1376500129699707,
+      "eval_runtime": 51.9091,
+      "eval_samples_per_second": 196.382,
+      "eval_steps_per_second": 1.541,
+      "step": 193000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 2.192619562149048,
+      "learning_rate": 2.253578692915919e-05,
+      "loss": 2.4244,
+      "step": 193100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 2.2540953159332275,
+      "learning_rate": 2.2513755234967317e-05,
+      "loss": 2.4187,
+      "step": 193200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 2.1056604385375977,
+      "learning_rate": 2.2491725490790526e-05,
+      "loss": 2.4017,
+      "step": 193300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 2.1589183807373047,
+      "learning_rate": 2.2469697713907186e-05,
+      "loss": 2.4083,
+      "step": 193400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 2.1547043323516846,
+      "learning_rate": 2.244767192159417e-05,
+      "loss": 2.4065,
+      "step": 193500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 2.057020425796509,
+      "learning_rate": 2.2425648131126777e-05,
+      "loss": 2.3981,
+      "step": 193600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 2.380244255065918,
+      "learning_rate": 2.2403626359778753e-05,
+      "loss": 2.404,
+      "step": 193700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 2.1975646018981934,
+      "learning_rate": 2.2381606624822228e-05,
+      "loss": 2.3931,
+      "step": 193800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 2.0740909576416016,
+      "learning_rate": 2.2359588943527746e-05,
+      "loss": 2.4027,
+      "step": 193900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 2.2962470054626465,
+      "learning_rate": 2.233757333316426e-05,
+      "loss": 2.3949,
+      "step": 194000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.148186206817627,
+      "eval_runtime": 51.9071,
+      "eval_samples_per_second": 196.389,
+      "eval_steps_per_second": 1.541,
+      "step": 194000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 2.1983277797698975,
+      "learning_rate": 2.2315559810999086e-05,
+      "loss": 2.3911,
+      "step": 194100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 2.1726229190826416,
+      "learning_rate": 2.2293548394297893e-05,
+      "loss": 2.3763,
+      "step": 194200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 2.190869092941284,
+      "learning_rate": 2.2271539100324705e-05,
+      "loss": 2.3822,
+      "step": 194300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 2.150756359100342,
+      "learning_rate": 2.22495319463419e-05,
+      "loss": 2.383,
+      "step": 194400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 2.159919500350952,
+      "learning_rate": 2.222752694961014e-05,
+      "loss": 2.3799,
+      "step": 194500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 2.1796655654907227,
+      "learning_rate": 2.2205524127388438e-05,
+      "loss": 2.3804,
+      "step": 194600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 2.313180446624756,
+      "learning_rate": 2.2183523496934052e-05,
+      "loss": 2.3574,
+      "step": 194700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 2.2141001224517822,
+      "learning_rate": 2.2161525075502565e-05,
+      "loss": 2.3726,
+      "step": 194800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 2.145921468734741,
+      "learning_rate": 2.2139528880347807e-05,
+      "loss": 2.3633,
+      "step": 194900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 2.279843330383301,
+      "learning_rate": 2.2117534928721878e-05,
+      "loss": 2.3747,
+      "step": 195000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.156066417694092,
+      "eval_runtime": 51.6132,
+      "eval_samples_per_second": 197.508,
+      "eval_steps_per_second": 1.55,
+      "step": 195000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 2.107222557067871,
+      "learning_rate": 2.2095543237875088e-05,
+      "loss": 2.3612,
+      "step": 195100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 2.1660873889923096,
+      "learning_rate": 2.207355382505599e-05,
+      "loss": 2.3562,
+      "step": 195200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 2.206403970718384,
+      "learning_rate": 2.2051566707511362e-05,
+      "loss": 2.371,
+      "step": 195300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 2.277531147003174,
+      "learning_rate": 2.2029581902486176e-05,
+      "loss": 2.3571,
+      "step": 195400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 2.041177749633789,
+      "learning_rate": 2.200759942722357e-05,
+      "loss": 2.3658,
+      "step": 195500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 2.2721259593963623,
+      "learning_rate": 2.1985619298964884e-05,
+      "loss": 2.3531,
+      "step": 195600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 2.2664246559143066,
+      "learning_rate": 2.1963641534949597e-05,
+      "loss": 2.3522,
+      "step": 195700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 2.323575258255005,
+      "learning_rate": 2.1941666152415343e-05,
+      "loss": 2.342,
+      "step": 195800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 2.1575205326080322,
+      "learning_rate": 2.1919693168597887e-05,
+      "loss": 2.3505,
+      "step": 195900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 2.265693187713623,
+      "learning_rate": 2.1897722600731107e-05,
+      "loss": 2.3428,
+      "step": 196000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.1471190452575684,
+      "eval_runtime": 51.714,
+      "eval_samples_per_second": 197.123,
+      "eval_steps_per_second": 1.547,
+      "step": 196000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 2.1313159465789795,
+      "learning_rate": 2.187575446604699e-05,
+      "loss": 2.344,
+      "step": 196100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 2.165553569793701,
+      "learning_rate": 2.1853788781775626e-05,
+      "loss": 2.3369,
+      "step": 196200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 2.348489999771118,
+      "learning_rate": 2.1831825565145155e-05,
+      "loss": 2.3325,
+      "step": 196300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 2.2844085693359375,
+      "learning_rate": 2.1809864833381816e-05,
+      "loss": 2.3458,
+      "step": 196400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 2.1077094078063965,
+      "learning_rate": 2.1787906603709863e-05,
+      "loss": 2.3301,
+      "step": 196500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 2.2360150814056396,
+      "learning_rate": 2.1765950893351627e-05,
+      "loss": 2.3357,
+      "step": 196600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 2.1342897415161133,
+      "learning_rate": 2.1743997719527423e-05,
+      "loss": 2.3309,
+      "step": 196700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 2.3143725395202637,
+      "learning_rate": 2.17220470994556e-05,
+      "loss": 2.3497,
+      "step": 196800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 2.207287549972534,
+      "learning_rate": 2.170009905035251e-05,
+      "loss": 2.3268,
+      "step": 196900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 2.1440131664276123,
+      "learning_rate": 2.167815358943248e-05,
+      "loss": 2.3535,
+      "step": 197000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.1492607593536377,
+      "eval_runtime": 51.6958,
+      "eval_samples_per_second": 197.192,
+      "eval_steps_per_second": 1.548,
+      "step": 197000
+    },
+    {
+      "epoch": 0.036559703950122514,
+      "grad_norm": 2.189824342727661,
+      "learning_rate": 2.165621073390779e-05,
+      "loss": 2.3368,
+      "step": 197100
+    },
+    {
+      "epoch": 0.03683878565966544,
+      "grad_norm": 2.1990151405334473,
+      "learning_rate": 2.16342705009887e-05,
+      "loss": 2.3344,
+      "step": 197200
+    },
+    {
+      "epoch": 0.03711786736920836,
+      "grad_norm": 2.077488899230957,
+      "learning_rate": 2.1612332907883405e-05,
+      "loss": 2.3267,
+      "step": 197300
+    },
+    {
+      "epoch": 0.037396949078751275,
+      "grad_norm": 2.2698981761932373,
+      "learning_rate": 2.1590397971798025e-05,
+      "loss": 2.3285,
+      "step": 197400
+    },
+    {
+      "epoch": 0.0376760307882942,
+      "grad_norm": 2.2117862701416016,
+      "learning_rate": 2.1568465709936615e-05,
+      "loss": 2.322,
+      "step": 197500
+    },
+    {
+      "epoch": 0.03795511249783712,
+      "grad_norm": 2.194138288497925,
+      "learning_rate": 2.15465361395011e-05,
+      "loss": 2.3228,
+      "step": 197600
+    },
+    {
+      "epoch": 0.038234194207380036,
+      "grad_norm": 2.151017665863037,
+      "learning_rate": 2.1524609277691327e-05,
+      "loss": 2.3376,
+      "step": 197700
+    },
+    {
+      "epoch": 0.038513275916922954,
+      "grad_norm": 2.273414373397827,
+      "learning_rate": 2.1502685141704992e-05,
+      "loss": 2.3298,
+      "step": 197800
+    },
+    {
+      "epoch": 0.03879235762646588,
+      "grad_norm": 2.2569565773010254,
+      "learning_rate": 2.148076374873768e-05,
+      "loss": 2.3371,
+      "step": 197900
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "grad_norm": 2.109938621520996,
+      "learning_rate": 2.1458845115982783e-05,
+      "loss": 2.3074,
+      "step": 198000
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "eval_loss": 2.156459331512451,
+      "eval_runtime": 51.6968,
+      "eval_samples_per_second": 197.188,
+      "eval_steps_per_second": 1.547,
+      "step": 198000
+    },
+    {
+      "epoch": 0.039350521045551715,
+      "grad_norm": 2.1745998859405518,
+      "learning_rate": 2.1436929260631578e-05,
+      "loss": 2.3337,
+      "step": 198100
+    },
+    {
+      "epoch": 0.03962960275509464,
+      "grad_norm": 2.120976448059082,
+      "learning_rate": 2.141501619987313e-05,
+      "loss": 2.3231,
+      "step": 198200
+    },
+    {
+      "epoch": 0.03990868446463756,
+      "grad_norm": 2.1885461807250977,
+      "learning_rate": 2.139310595089434e-05,
+      "loss": 2.3277,
+      "step": 198300
+    },
+    {
+      "epoch": 0.040187766174180475,
+      "grad_norm": 2.2620506286621094,
+      "learning_rate": 2.137119853087986e-05,
+      "loss": 2.3335,
+      "step": 198400
+    },
+    {
+      "epoch": 0.04046684788372339,
+      "grad_norm": 2.1864798069000244,
+      "learning_rate": 2.1349293957012156e-05,
+      "loss": 2.3239,
+      "step": 198500
+    },
+    {
+      "epoch": 0.04074592959326632,
+      "grad_norm": 2.1792876720428467,
+      "learning_rate": 2.1327392246471463e-05,
+      "loss": 2.3166,
+      "step": 198600
+    },
+    {
+      "epoch": 0.041025011302809236,
+      "grad_norm": 2.264899730682373,
+      "learning_rate": 2.1305493416435765e-05,
+      "loss": 2.3171,
+      "step": 198700
+    },
+    {
+      "epoch": 0.041304093012352154,
+      "grad_norm": 1.9806472063064575,
+      "learning_rate": 2.1283597484080765e-05,
+      "loss": 2.3247,
+      "step": 198800
+    },
+    {
+      "epoch": 0.04158317472189508,
+      "grad_norm": 2.1849722862243652,
+      "learning_rate": 2.1261704466579928e-05,
+      "loss": 2.3158,
+      "step": 198900
+    },
+    {
+      "epoch": 0.041862256431438,
+      "grad_norm": 2.025466203689575,
+      "learning_rate": 2.1239814381104417e-05,
+      "loss": 2.3061,
+      "step": 199000
+    },
+    {
+      "epoch": 0.041862256431438,
+      "eval_loss": 2.146428108215332,
+      "eval_runtime": 51.7966,
+      "eval_samples_per_second": 196.808,
+      "eval_steps_per_second": 1.545,
+      "step": 199000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 2.0780715942382812,
+      "learning_rate": 2.1217927244823092e-05,
+      "loss": 2.3137,
+      "step": 199100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 2.1887388229370117,
+      "learning_rate": 2.1196043074902503e-05,
+      "loss": 2.311,
+      "step": 199200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 2.110805034637451,
+      "learning_rate": 2.1174161888506867e-05,
+      "loss": 2.3166,
+      "step": 199300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 2.1829800605773926,
+      "learning_rate": 2.1152283702798077e-05,
+      "loss": 2.3035,
+      "step": 199400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 2.2523720264434814,
+      "learning_rate": 2.1130408534935664e-05,
+      "loss": 2.3104,
+      "step": 199500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 2.268869400024414,
+      "learning_rate": 2.1108536402076777e-05,
+      "loss": 2.3095,
+      "step": 199600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 2.352266788482666,
+      "learning_rate": 2.108666732137622e-05,
+      "loss": 2.3235,
+      "step": 199700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 2.0702219009399414,
+      "learning_rate": 2.106480130998636e-05,
+      "loss": 2.301,
+      "step": 199800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 2.219024896621704,
+      "learning_rate": 2.1042938385057202e-05,
+      "loss": 2.2952,
+      "step": 199900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 2.2651102542877197,
+      "learning_rate": 2.102107856373628e-05,
+      "loss": 2.302,
+      "step": 200000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.1563775539398193,
+      "eval_runtime": 52.0565,
+      "eval_samples_per_second": 195.826,
+      "eval_steps_per_second": 1.537,
+      "step": 200000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 2.2001454830169678,
+      "learning_rate": 2.0999221863168736e-05,
+      "loss": 2.3131,
+      "step": 200100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 2.1782033443450928,
+      "learning_rate": 2.0977368300497246e-05,
+      "loss": 2.3084,
+      "step": 200200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 2.282090663909912,
+      "learning_rate": 2.095551789286204e-05,
+      "loss": 2.2983,
+      "step": 200300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 2.1379668712615967,
+      "learning_rate": 2.0933670657400838e-05,
+      "loss": 2.2989,
+      "step": 200400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 2.3254175186157227,
+      "learning_rate": 2.091182661124891e-05,
+      "loss": 2.3211,
+      "step": 200500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 2.112151622772217,
+      "learning_rate": 2.0889985771539002e-05,
+      "loss": 2.288,
+      "step": 200600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 2.2548341751098633,
+      "learning_rate": 2.0868148155401356e-05,
+      "loss": 2.3027,
+      "step": 200700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 2.3280234336853027,
+      "learning_rate": 2.0846313779963696e-05,
+      "loss": 2.3049,
+      "step": 200800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 2.256028175354004,
+      "learning_rate": 2.0824482662351167e-05,
+      "loss": 2.3023,
+      "step": 200900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 2.195711851119995,
+      "learning_rate": 2.0802654819686398e-05,
+      "loss": 2.2887,
+      "step": 201000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.14955997467041,
+      "eval_runtime": 51.6475,
+      "eval_samples_per_second": 197.376,
+      "eval_steps_per_second": 1.549,
+      "step": 201000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 2.0905580520629883,
+      "learning_rate": 2.0780830269089423e-05,
+      "loss": 2.2914,
+      "step": 201100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 2.1279406547546387,
+      "learning_rate": 2.0759009027677727e-05,
+      "loss": 2.3037,
+      "step": 201200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 2.178835868835449,
+      "learning_rate": 2.0737191112566146e-05,
+      "loss": 2.2989,
+      "step": 201300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 2.2267632484436035,
+      "learning_rate": 2.071537654086696e-05,
+      "loss": 2.2928,
+      "step": 201400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 2.310661792755127,
+      "learning_rate": 2.0693565329689793e-05,
+      "loss": 2.3337,
+      "step": 201500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 2.2507314682006836,
+      "learning_rate": 2.0671757496141665e-05,
+      "loss": 2.3269,
+      "step": 201600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 2.161654472351074,
+      "learning_rate": 2.0649953057326904e-05,
+      "loss": 2.3191,
+      "step": 201700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 2.2663004398345947,
+      "learning_rate": 2.0628152030347214e-05,
+      "loss": 2.3153,
+      "step": 201800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 2.2835566997528076,
+      "learning_rate": 2.06063544323016e-05,
+      "loss": 2.3127,
+      "step": 201900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 2.1445398330688477,
+      "learning_rate": 2.0584560280286397e-05,
+      "loss": 2.2974,
+      "step": 202000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.17156720161438,
+      "eval_runtime": 51.3282,
+      "eval_samples_per_second": 198.604,
+      "eval_steps_per_second": 1.559,
+      "step": 202000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 2.2205893993377686,
+      "learning_rate": 2.0562769591395203e-05,
+      "loss": 2.3078,
+      "step": 202100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 2.205244541168213,
+      "learning_rate": 2.054098238271894e-05,
+      "loss": 2.2938,
+      "step": 202200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 2.2526943683624268,
+      "learning_rate": 2.0519198671345784e-05,
+      "loss": 2.2967,
+      "step": 202300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 2.3271262645721436,
+      "learning_rate": 2.049741847436116e-05,
+      "loss": 2.2701,
+      "step": 202400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 2.225120782852173,
+      "learning_rate": 2.047564180884775e-05,
+      "loss": 2.3035,
+      "step": 202500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 2.1193225383758545,
+      "learning_rate": 2.0453868691885446e-05,
+      "loss": 2.287,
+      "step": 202600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 2.305154800415039,
+      "learning_rate": 2.043209914055138e-05,
+      "loss": 2.2997,
+      "step": 202700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 2.1183183193206787,
+      "learning_rate": 2.041033317191989e-05,
+      "loss": 2.3005,
+      "step": 202800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 2.175776481628418,
+      "learning_rate": 2.0388570803062465e-05,
+      "loss": 2.2992,
+      "step": 202900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 2.2266392707824707,
+      "learning_rate": 2.036681205104782e-05,
+      "loss": 2.2959,
+      "step": 203000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.167436122894287,
+      "eval_runtime": 51.341,
+      "eval_samples_per_second": 198.555,
+      "eval_steps_per_second": 1.558,
+      "step": 203000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 2.2803258895874023,
+      "learning_rate": 2.0345056932941793e-05,
+      "loss": 2.2866,
+      "step": 203100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 2.1691677570343018,
+      "learning_rate": 2.032330546580741e-05,
+      "loss": 2.2798,
+      "step": 203200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 2.0682337284088135,
+      "learning_rate": 2.0301557666704787e-05,
+      "loss": 2.2847,
+      "step": 203300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 2.41520357131958,
+      "learning_rate": 2.0279813552691208e-05,
+      "loss": 2.2897,
+      "step": 203400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 2.2019283771514893,
+      "learning_rate": 2.025807314082104e-05,
+      "loss": 2.2855,
+      "step": 203500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 2.154576539993286,
+      "learning_rate": 2.0236336448145766e-05,
+      "loss": 2.2726,
+      "step": 203600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 2.2569046020507812,
+      "learning_rate": 2.0214603491713928e-05,
+      "loss": 2.2666,
+      "step": 203700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 2.306614875793457,
+      "learning_rate": 2.0192874288571152e-05,
+      "loss": 2.2826,
+      "step": 203800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 2.2659449577331543,
+      "learning_rate": 2.017114885576012e-05,
+      "loss": 2.288,
+      "step": 203900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 2.14077091217041,
+      "learning_rate": 2.0149427210320545e-05,
+      "loss": 2.2729,
+      "step": 204000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.164825916290283,
+      "eval_runtime": 51.3793,
+      "eval_samples_per_second": 198.407,
+      "eval_steps_per_second": 1.557,
+      "step": 204000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 2.265152931213379,
+      "learning_rate": 2.0127709369289202e-05,
+      "loss": 2.2654,
+      "step": 204100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 2.0833661556243896,
+      "learning_rate": 2.0105995349699832e-05,
+      "loss": 2.2863,
+      "step": 204200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 2.2797181606292725,
+      "learning_rate": 2.008428516858323e-05,
+      "loss": 2.2702,
+      "step": 204300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 2.2614681720733643,
+      "learning_rate": 2.006257884296713e-05,
+      "loss": 2.2846,
+      "step": 204400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 2.1245336532592773,
+      "learning_rate": 2.00408763898763e-05,
+      "loss": 2.2758,
+      "step": 204500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 2.14581298828125,
+      "learning_rate": 2.001917782633241e-05,
+      "loss": 2.2624,
+      "step": 204600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 2.240208864212036,
+      "learning_rate": 1.9997483169354124e-05,
+      "loss": 2.2563,
+      "step": 204700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 2.290208578109741,
+      "learning_rate": 1.9975792435957024e-05,
+      "loss": 2.2733,
+      "step": 204800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 2.309551954269409,
+      "learning_rate": 1.9954105643153624e-05,
+      "loss": 2.2575,
+      "step": 204900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 2.183645009994507,
+      "learning_rate": 1.9932422807953323e-05,
+      "loss": 2.2796,
+      "step": 205000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.1678764820098877,
+      "eval_runtime": 51.2196,
+      "eval_samples_per_second": 199.025,
+      "eval_steps_per_second": 1.562,
+      "step": 205000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 2.1871604919433594,
+      "learning_rate": 1.9910743947362455e-05,
+      "loss": 2.2631,
+      "step": 205100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 2.1617250442504883,
+      "learning_rate": 1.9889069078384193e-05,
+      "loss": 2.2609,
+      "step": 205200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 2.183656692504883,
+      "learning_rate": 1.9867398218018624e-05,
+      "loss": 2.2568,
+      "step": 205300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 2.2372233867645264,
+      "learning_rate": 1.9845731383262646e-05,
+      "loss": 2.2663,
+      "step": 205400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 2.200566053390503,
+      "learning_rate": 1.9824068591110034e-05,
+      "loss": 2.2511,
+      "step": 205500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 2.1325571537017822,
+      "learning_rate": 1.9802409858551382e-05,
+      "loss": 2.2628,
+      "step": 205600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 2.1458706855773926,
+      "learning_rate": 1.9780755202574098e-05,
+      "loss": 2.2565,
+      "step": 205700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 2.397474527359009,
+      "learning_rate": 1.9759104640162388e-05,
+      "loss": 2.2582,
+      "step": 205800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 2.239386558532715,
+      "learning_rate": 1.9737458188297247e-05,
+      "loss": 2.2484,
+      "step": 205900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 2.17461895942688,
+      "learning_rate": 1.9715815863956462e-05,
+      "loss": 2.2536,
+      "step": 206000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.1656434535980225,
+      "eval_runtime": 51.4827,
+      "eval_samples_per_second": 198.008,
+      "eval_steps_per_second": 1.554,
+      "step": 206000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 2.1970889568328857,
+      "learning_rate": 1.969417768411458e-05,
+      "loss": 2.269,
+      "step": 206100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 2.151305913925171,
+      "learning_rate": 1.967254366574286e-05,
+      "loss": 2.2609,
+      "step": 206200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 2.164149045944214,
+      "learning_rate": 1.965091382580935e-05,
+      "loss": 2.2608,
+      "step": 206300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 2.203151226043701,
+      "learning_rate": 1.9629288181278795e-05,
+      "loss": 2.2616,
+      "step": 206400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 2.1855273246765137,
+      "learning_rate": 1.960766674911264e-05,
+      "loss": 2.2614,
+      "step": 206500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 2.124351978302002,
+      "learning_rate": 1.958604954626906e-05,
+      "loss": 2.2448,
+      "step": 206600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 2.177095890045166,
+      "learning_rate": 1.9564436589702864e-05,
+      "loss": 2.2519,
+      "step": 206700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 2.1898281574249268,
+      "learning_rate": 1.9542827896365568e-05,
+      "loss": 2.2608,
+      "step": 206800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 2.2773730754852295,
+      "learning_rate": 1.9521223483205342e-05,
+      "loss": 2.262,
+      "step": 206900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 2.2109436988830566,
+      "learning_rate": 1.9499623367166982e-05,
+      "loss": 2.2448,
+      "step": 207000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.164100408554077,
+      "eval_runtime": 51.5664,
+      "eval_samples_per_second": 197.687,
+      "eval_steps_per_second": 1.551,
+      "step": 207000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 2.2141733169555664,
+      "learning_rate": 1.9478027565191922e-05,
+      "loss": 2.2537,
+      "step": 207100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 2.2592718601226807,
+      "learning_rate": 1.945643609421821e-05,
+      "loss": 2.2441,
+      "step": 207200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 2.2082977294921875,
+      "learning_rate": 1.94348489711805e-05,
+      "loss": 2.2529,
+      "step": 207300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 2.2095062732696533,
+      "learning_rate": 1.941326621301005e-05,
+      "loss": 2.2597,
+      "step": 207400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 2.189436674118042,
+      "learning_rate": 1.939168783663466e-05,
+      "loss": 2.2455,
+      "step": 207500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 2.218168258666992,
+      "learning_rate": 1.9370113858978722e-05,
+      "loss": 2.2485,
+      "step": 207600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 2.1648590564727783,
+      "learning_rate": 1.9348544296963165e-05,
+      "loss": 2.2456,
+      "step": 207700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 2.121211051940918,
+      "learning_rate": 1.9326979167505474e-05,
+      "loss": 2.2364,
+      "step": 207800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 2.271167039871216,
+      "learning_rate": 1.9305418487519617e-05,
+      "loss": 2.2561,
+      "step": 207900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 2.3215372562408447,
+      "learning_rate": 1.9283862273916116e-05,
+      "loss": 2.2397,
+      "step": 208000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.164187431335449,
+      "eval_runtime": 51.5373,
+      "eval_samples_per_second": 197.799,
+      "eval_steps_per_second": 1.552,
+      "step": 208000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 2.174811363220215,
+      "learning_rate": 1.9262310543601962e-05,
+      "loss": 2.2412,
+      "step": 208100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 2.1047627925872803,
+      "learning_rate": 1.9240763313480655e-05,
+      "loss": 2.2363,
+      "step": 208200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 2.2328543663024902,
+      "learning_rate": 1.9219220600452127e-05,
+      "loss": 2.2537,
+      "step": 208300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 2.1852455139160156,
+      "learning_rate": 1.919768242141281e-05,
+      "loss": 2.2472,
+      "step": 208400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 2.23559832572937,
+      "learning_rate": 1.9176148793255543e-05,
+      "loss": 2.243,
+      "step": 208500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 2.195355176925659,
+      "learning_rate": 1.9154619732869626e-05,
+      "loss": 2.2463,
+      "step": 208600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 2.295536994934082,
+      "learning_rate": 1.913309525714075e-05,
+      "loss": 2.2413,
+      "step": 208700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 2.373781681060791,
+      "learning_rate": 1.9111575382951026e-05,
+      "loss": 2.2385,
+      "step": 208800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 2.3178882598876953,
+      "learning_rate": 1.909006012717896e-05,
+      "loss": 2.2454,
+      "step": 208900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 2.2002763748168945,
+      "learning_rate": 1.9068549506699425e-05,
+      "loss": 2.236,
+      "step": 209000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.1654672622680664,
+      "eval_runtime": 51.5727,
+      "eval_samples_per_second": 197.663,
+      "eval_steps_per_second": 1.551,
+      "step": 209000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 2.2618346214294434,
+      "learning_rate": 1.9047043538383662e-05,
+      "loss": 2.2211,
+      "step": 209100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 2.2079176902770996,
+      "learning_rate": 1.9025542239099252e-05,
+      "loss": 2.2456,
+      "step": 209200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 2.119337797164917,
+      "learning_rate": 1.9004045625710136e-05,
+      "loss": 2.2356,
+      "step": 209300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 2.2664501667022705,
+      "learning_rate": 1.8982553715076583e-05,
+      "loss": 2.2403,
+      "step": 209400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 2.2333970069885254,
+      "learning_rate": 1.8961066524055128e-05,
+      "loss": 2.2522,
+      "step": 209500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 2.1713504791259766,
+      "learning_rate": 1.8939584069498647e-05,
+      "loss": 2.2488,
+      "step": 209600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 2.1721699237823486,
+      "learning_rate": 1.8918106368256302e-05,
+      "loss": 2.2418,
+      "step": 209700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 2.102562189102173,
+      "learning_rate": 1.88966334371735e-05,
+      "loss": 2.2346,
+      "step": 209800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 2.1796703338623047,
+      "learning_rate": 1.8875165293091936e-05,
+      "loss": 2.2445,
+      "step": 209900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 2.25935697555542,
+      "learning_rate": 1.885370195284952e-05,
+      "loss": 2.2407,
+      "step": 210000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.174961566925049,
+      "eval_runtime": 51.6538,
+      "eval_samples_per_second": 197.352,
+      "eval_steps_per_second": 1.549,
+      "step": 210000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 2.1532399654388428,
+      "learning_rate": 1.8832243433280412e-05,
+      "loss": 2.2312,
+      "step": 210100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 2.322571277618408,
+      "learning_rate": 1.8810789751215e-05,
+      "loss": 2.235,
+      "step": 210200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 2.1225528717041016,
+      "learning_rate": 1.8789340923479862e-05,
+      "loss": 2.2175,
+      "step": 210300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 2.2108681201934814,
+      "learning_rate": 1.8767896966897768e-05,
+      "loss": 2.239,
+      "step": 210400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 2.227198839187622,
+      "learning_rate": 1.8746457898287673e-05,
+      "loss": 2.2274,
+      "step": 210500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 2.250565528869629,
+      "learning_rate": 1.8725023734464702e-05,
+      "loss": 2.2318,
+      "step": 210600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 2.1811561584472656,
+      "learning_rate": 1.8703594492240138e-05,
+      "loss": 2.2033,
+      "step": 210700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 2.1336236000061035,
+      "learning_rate": 1.8682170188421375e-05,
+      "loss": 2.1952,
+      "step": 210800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 2.2047863006591797,
+      "learning_rate": 1.8660750839811963e-05,
+      "loss": 2.1909,
+      "step": 210900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 2.003309965133667,
+      "learning_rate": 1.8639336463211566e-05,
+      "loss": 2.1693,
+      "step": 211000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.171804189682007,
+      "eval_runtime": 51.5992,
+      "eval_samples_per_second": 197.561,
+      "eval_steps_per_second": 1.55,
+      "step": 211000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 2.105639934539795,
+      "learning_rate": 1.861792707541593e-05,
+      "loss": 2.1683,
+      "step": 211100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 2.2332839965820312,
+      "learning_rate": 1.8596522693216888e-05,
+      "loss": 2.1594,
+      "step": 211200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 2.2061290740966797,
+      "learning_rate": 1.8575123333402367e-05,
+      "loss": 2.1593,
+      "step": 211300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 2.0589332580566406,
+      "learning_rate": 1.855372901275634e-05,
+      "loss": 2.1437,
+      "step": 211400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 2.1569809913635254,
+      "learning_rate": 1.8532339748058844e-05,
+      "loss": 2.1533,
+      "step": 211500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 2.1025686264038086,
+      "learning_rate": 1.8510955556085915e-05,
+      "loss": 2.1525,
+      "step": 211600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 2.19555926322937,
+      "learning_rate": 1.848957645360965e-05,
+      "loss": 2.1447,
+      "step": 211700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 2.095914840698242,
+      "learning_rate": 1.8468202457398126e-05,
+      "loss": 2.1421,
+      "step": 211800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 2.1924917697906494,
+      "learning_rate": 1.8446833584215444e-05,
+      "loss": 2.1416,
+      "step": 211900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 2.123359203338623,
+      "learning_rate": 1.8425469850821648e-05,
+      "loss": 2.1465,
+      "step": 212000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.1811015605926514,
+      "eval_runtime": 51.5948,
+      "eval_samples_per_second": 197.578,
+      "eval_steps_per_second": 1.551,
+      "step": 212000
+    },
+    {
+      "epoch": 0.036559703950122514,
+      "grad_norm": 2.065702438354492,
+      "learning_rate": 1.840411127397278e-05,
+      "loss": 2.1352,
+      "step": 212100
+    },
+    {
+      "epoch": 0.03683878565966544,
+      "grad_norm": 2.0806708335876465,
+      "learning_rate": 1.838275787042083e-05,
+      "loss": 2.1432,
+      "step": 212200
+    },
+    {
+      "epoch": 0.03711786736920836,
+      "grad_norm": 2.1028740406036377,
+      "learning_rate": 1.8361409656913744e-05,
+      "loss": 2.1349,
+      "step": 212300
+    },
+    {
+      "epoch": 0.037396949078751275,
+      "grad_norm": 2.1603927612304688,
+      "learning_rate": 1.8340066650195363e-05,
+      "loss": 2.1307,
+      "step": 212400
+    },
+    {
+      "epoch": 0.0376760307882942,
+      "grad_norm": 2.016268014907837,
+      "learning_rate": 1.831872886700547e-05,
+      "loss": 2.129,
+      "step": 212500
+    },
+    {
+      "epoch": 0.03795511249783712,
+      "grad_norm": 1.9362486600875854,
+      "learning_rate": 1.829739632407975e-05,
+      "loss": 2.1187,
+      "step": 212600
+    },
+    {
+      "epoch": 0.038234194207380036,
+      "grad_norm": 2.1569607257843018,
+      "learning_rate": 1.827606903814977e-05,
+      "loss": 2.1314,
+      "step": 212700
+    },
+    {
+      "epoch": 0.038513275916922954,
+      "grad_norm": 2.0166728496551514,
+      "learning_rate": 1.825474702594299e-05,
+      "loss": 2.1274,
+      "step": 212800
+    },
+    {
+      "epoch": 0.03879235762646588,
+      "grad_norm": 2.1779658794403076,
+      "learning_rate": 1.8233430304182704e-05,
+      "loss": 2.1183,
+      "step": 212900
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "grad_norm": 2.1090939044952393,
+      "learning_rate": 1.821211888958808e-05,
+      "loss": 2.126,
+      "step": 213000
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "eval_loss": 2.1809489727020264,
+      "eval_runtime": 51.5547,
+      "eval_samples_per_second": 197.732,
+      "eval_steps_per_second": 1.552,
+      "step": 213000
+    },
+    {
+      "epoch": 0.039350521045551715,
+      "grad_norm": 2.2175374031066895,
+      "learning_rate": 1.819081279887411e-05,
+      "loss": 2.1201,
+      "step": 213100
+    },
+    {
+      "epoch": 0.03962960275509464,
+      "grad_norm": 2.0139071941375732,
+      "learning_rate": 1.8169512048751648e-05,
+      "loss": 2.1207,
+      "step": 213200
+    },
+    {
+      "epoch": 0.03990868446463756,
+      "grad_norm": 2.101840019226074,
+      "learning_rate": 1.814821665592729e-05,
+      "loss": 2.1145,
+      "step": 213300
+    },
+    {
+      "epoch": 0.040187766174180475,
+      "grad_norm": 2.199965238571167,
+      "learning_rate": 1.8126926637103484e-05,
+      "loss": 2.1256,
+      "step": 213400
+    },
+    {
+      "epoch": 0.04046684788372339,
+      "grad_norm": 2.042839288711548,
+      "learning_rate": 1.8105642008978458e-05,
+      "loss": 2.1096,
+      "step": 213500
+    },
+    {
+      "epoch": 0.04074592959326632,
+      "grad_norm": 2.233668804168701,
+      "learning_rate": 1.808436278824619e-05,
+      "loss": 2.1099,
+      "step": 213600
+    },
+    {
+      "epoch": 0.041025011302809236,
+      "grad_norm": 2.0933728218078613,
+      "learning_rate": 1.8063088991596437e-05,
+      "loss": 2.1014,
+      "step": 213700
+    },
+    {
+      "epoch": 0.041304093012352154,
+      "grad_norm": 2.1422884464263916,
+      "learning_rate": 1.8041820635714682e-05,
+      "loss": 2.1034,
+      "step": 213800
+    },
+    {
+      "epoch": 0.04158317472189508,
+      "grad_norm": 2.0475480556488037,
+      "learning_rate": 1.802055773728216e-05,
+      "loss": 2.1116,
+      "step": 213900
+    },
+    {
+      "epoch": 0.041862256431438,
+      "grad_norm": 2.0574936866760254,
+      "learning_rate": 1.799930031297583e-05,
+      "loss": 2.1181,
+      "step": 214000
+    },
+    {
+      "epoch": 0.041862256431438,
+      "eval_loss": 2.1702778339385986,
+      "eval_runtime": 51.7407,
+      "eval_samples_per_second": 197.021,
+      "eval_steps_per_second": 1.546,
+      "step": 214000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 1.8676295751696384e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6be0aaef9589a43e4cde380bc3e83ccd55ea3b262dc3f11f0bbc4b35fc934376
 size 5777

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b369d4c284193104629459ff70a317184ca3f350753d5cc563977de982dd1e9
 size 5777