PurplelinkPL committed
Commit c8ffb41 · verified · 1 Parent(s): 40d61ef

Upload 10 files

Files changed (6)
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1485 -3
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4baee20d911bded3ac972714a9c339be4051aac75f3be17c5dd47c3bb0a04e63
+ oid sha256:8cf6fd855c268d3938b4c6d6a77aa9284b5bd03c679875c89cb3579c489295b8
  size 598635032
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dffb20f09d581f1a8db94110ac7014fac958626dee3c29e960da4cb1c9f38e85
+ oid sha256:20d57d69e8d4eb0bcf9143bb2a5722964a200d83b3b1c090ed18f98299556b3a
  size 1197359627
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db4c787397c7bd17a5fb6bef85caf0ed539cdf41b0fe201e17b766ab049c2a38
+ oid sha256:be293c0bc96c40007a1ca95bf99da704f29c24d932c7c8e19b962a361adfdc4c
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:528826677c74cc85ac3103f0a7ddc5d791ae235096533cc53310113c112a4947
+ oid sha256:af5ee4dc438217ac40b2a125900214146b721f2725ba954be785ed61a3abe011
  size 1465
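
Note: the binary artifacts in this commit (model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt, and training_args.bin below) are stored as Git LFS pointers, so only the SHA-256 object ID changes while the reported size stays constant. The following is a minimal sketch for checking a locally downloaded file against the new pointer values; the local path is hypothetical, and the expected OID and size are copied from the model.safetensors entry above.

import hashlib

# Expected values taken from the new LFS pointer for model.safetensors (see the diff above).
EXPECTED_OID = "8cf6fd855c268d3938b4c6d6a77aa9284b5bd03c679875c89cb3579c489295b8"
EXPECTED_SIZE = 598635032

def matches_lfs_pointer(path, expected_oid, expected_size):
    """Hash the file in 1 MiB chunks and compare size and SHA-256 to the pointer."""
    sha = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
            size += len(chunk)
    return size == expected_size and sha.hexdigest() == expected_oid

# Hypothetical local path; adjust to wherever the checkpoint was downloaded.
print(matches_lfs_pointer("model.safetensors", EXPECTED_OID, EXPECTED_SIZE))
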
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.0195357196680044,
+ "epoch": 0.0055816341908584,
  "eval_steps": 1000,
- "global_step": 167000,
+ "global_step": 186000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -13041,6 +13041,1488 @@
13041
  "eval_samples_per_second": 195.51,
13042
  "eval_steps_per_second": 1.534,
13043
  "step": 167000
13044
+ },
13045
+ {
13046
+ "epoch": 0.01981480137754732,
13047
+ "grad_norm": 1.5712428092956543,
13048
+ "learning_rate": 2.8278830871985708e-05,
13049
+ "loss": 1.7747,
13050
+ "step": 167100
13051
+ },
13052
+ {
13053
+ "epoch": 0.020093883087090238,
13054
+ "grad_norm": 1.5386340618133545,
13055
+ "learning_rate": 2.8256880354422098e-05,
13056
+ "loss": 1.7738,
13057
+ "step": 167200
13058
+ },
13059
+ {
13060
+ "epoch": 0.02037296479663316,
13061
+ "grad_norm": 1.5471428632736206,
13062
+ "learning_rate": 2.8234927282417417e-05,
13063
+ "loss": 1.779,
13064
+ "step": 167300
13065
+ },
13066
+ {
13067
+ "epoch": 0.020652046506176077,
13068
+ "grad_norm": 1.5163718461990356,
13069
+ "learning_rate": 2.821297167318992e-05,
13070
+ "loss": 1.7741,
13071
+ "step": 167400
13072
+ },
13073
+ {
13074
+ "epoch": 0.020931128215719,
13075
+ "grad_norm": 1.5554001331329346,
13076
+ "learning_rate": 2.819101354395986e-05,
13077
+ "loss": 1.7825,
13078
+ "step": 167500
13079
+ },
13080
+ {
13081
+ "epoch": 0.02121020992526192,
13082
+ "grad_norm": 1.4839155673980713,
13083
+ "learning_rate": 2.8169052911949484e-05,
13084
+ "loss": 1.7729,
13085
+ "step": 167600
13086
+ },
13087
+ {
13088
+ "epoch": 0.021489291634804838,
13089
+ "grad_norm": 1.5696512460708618,
13090
+ "learning_rate": 2.8147089794382965e-05,
13091
+ "loss": 1.7754,
13092
+ "step": 167700
13093
+ },
13094
+ {
13095
+ "epoch": 0.02176837334434776,
13096
+ "grad_norm": 1.6730250120162964,
13097
+ "learning_rate": 2.8125124208486465e-05,
13098
+ "loss": 1.7736,
13099
+ "step": 167800
13100
+ },
13101
+ {
13102
+ "epoch": 0.022047455053890677,
13103
+ "grad_norm": 1.6201075315475464,
13104
+ "learning_rate": 2.810315617148806e-05,
13105
+ "loss": 1.7771,
13106
+ "step": 167900
13107
+ },
13108
+ {
13109
+ "epoch": 0.0223265367634336,
13110
+ "grad_norm": 1.6662862300872803,
13111
+ "learning_rate": 2.8081185700617746e-05,
13112
+ "loss": 1.7761,
13113
+ "step": 168000
13114
+ },
13115
+ {
13116
+ "epoch": 0.0223265367634336,
13117
+ "eval_loss": 2.161256790161133,
13118
+ "eval_runtime": 52.0964,
13119
+ "eval_samples_per_second": 195.676,
13120
+ "eval_steps_per_second": 1.536,
13121
+ "step": 168000
13122
+ },
13123
+ {
13124
+ "epoch": 0.022605618472976517,
13125
+ "grad_norm": 1.4697953462600708,
13126
+ "learning_rate": 2.8059212813107438e-05,
13127
+ "loss": 1.7894,
13128
+ "step": 168100
13129
+ },
13130
+ {
13131
+ "epoch": 0.022884700182519438,
13132
+ "grad_norm": 1.6400997638702393,
13133
+ "learning_rate": 2.803723752619094e-05,
13134
+ "loss": 1.7779,
13135
+ "step": 168200
13136
+ },
13137
+ {
13138
+ "epoch": 0.02316378189206236,
13139
+ "grad_norm": 1.5220052003860474,
13140
+ "learning_rate": 2.8015259857103942e-05,
13141
+ "loss": 1.7732,
13142
+ "step": 168300
13143
+ },
13144
+ {
13145
+ "epoch": 0.023442863601605277,
13146
+ "grad_norm": 1.542869210243225,
13147
+ "learning_rate": 2.7993279823084007e-05,
13148
+ "loss": 1.7771,
13149
+ "step": 168400
13150
+ },
13151
+ {
13152
+ "epoch": 0.0237219453111482,
13153
+ "grad_norm": 1.4953099489212036,
13154
+ "learning_rate": 2.7971297441370542e-05,
13155
+ "loss": 1.7774,
13156
+ "step": 168500
13157
+ },
13158
+ {
13159
+ "epoch": 0.024001027020691117,
13160
+ "grad_norm": 1.5665849447250366,
13161
+ "learning_rate": 2.7949312729204803e-05,
13162
+ "loss": 1.7633,
13163
+ "step": 168600
13164
+ },
13165
+ {
13166
+ "epoch": 0.024280108730234038,
13167
+ "grad_norm": 1.5860687494277954,
13168
+ "learning_rate": 2.792732570382986e-05,
13169
+ "loss": 1.7798,
13170
+ "step": 168700
13171
+ },
13172
+ {
13173
+ "epoch": 0.02455919043977696,
13174
+ "grad_norm": 1.602845311164856,
13175
+ "learning_rate": 2.790533638249062e-05,
13176
+ "loss": 1.7694,
13177
+ "step": 168800
13178
+ },
13179
+ {
13180
+ "epoch": 0.024838272149319877,
13181
+ "grad_norm": 1.5015400648117065,
13182
+ "learning_rate": 2.7883344782433774e-05,
13183
+ "loss": 1.7628,
13184
+ "step": 168900
13185
+ },
13186
+ {
13187
+ "epoch": 0.0251173538588628,
13188
+ "grad_norm": 1.5296344757080078,
13189
+ "learning_rate": 2.7861350920907807e-05,
13190
+ "loss": 1.7753,
13191
+ "step": 169000
13192
+ },
13193
+ {
13194
+ "epoch": 0.0251173538588628,
13195
+ "eval_loss": 2.1639742851257324,
13196
+ "eval_runtime": 52.1527,
13197
+ "eval_samples_per_second": 195.465,
13198
+ "eval_steps_per_second": 1.534,
13199
+ "step": 169000
13200
+ },
13201
+ {
13202
+ "epoch": 0.025396435568405717,
13203
+ "grad_norm": 1.591369390487671,
13204
+ "learning_rate": 2.783935481516297e-05,
13205
+ "loss": 1.7695,
13206
+ "step": 169100
13207
+ },
13208
+ {
13209
+ "epoch": 0.025675517277948638,
13210
+ "grad_norm": 1.5569419860839844,
13211
+ "learning_rate": 2.7817356482451297e-05,
13212
+ "loss": 1.7689,
13213
+ "step": 169200
13214
+ },
13215
+ {
13216
+ "epoch": 0.025954598987491556,
13217
+ "grad_norm": 1.6080352067947388,
13218
+ "learning_rate": 2.779535594002654e-05,
13219
+ "loss": 1.767,
13220
+ "step": 169300
13221
+ },
13222
+ {
13223
+ "epoch": 0.026233680697034478,
13224
+ "grad_norm": 1.47182035446167,
13225
+ "learning_rate": 2.77733532051442e-05,
13226
+ "loss": 1.7717,
13227
+ "step": 169400
13228
+ },
13229
+ {
13230
+ "epoch": 0.0265127624065774,
13231
+ "grad_norm": 1.6706403493881226,
13232
+ "learning_rate": 2.775134829506148e-05,
13233
+ "loss": 1.7787,
13234
+ "step": 169500
13235
+ },
13236
+ {
13237
+ "epoch": 0.026791844116120317,
13238
+ "grad_norm": 1.6530786752700806,
13239
+ "learning_rate": 2.7729341227037313e-05,
13240
+ "loss": 1.7726,
13241
+ "step": 169600
13242
+ },
13243
+ {
13244
+ "epoch": 0.02707092582566324,
13245
+ "grad_norm": 1.4457296133041382,
13246
+ "learning_rate": 2.7707332018332323e-05,
13247
+ "loss": 1.7697,
13248
+ "step": 169700
13249
+ },
13250
+ {
13251
+ "epoch": 0.027350007535206156,
13252
+ "grad_norm": 1.5824190378189087,
13253
+ "learning_rate": 2.7685320686208793e-05,
13254
+ "loss": 1.7734,
13255
+ "step": 169800
13256
+ },
13257
+ {
13258
+ "epoch": 0.027629089244749078,
13259
+ "grad_norm": 1.6177047491073608,
13260
+ "learning_rate": 2.7663307247930686e-05,
13261
+ "loss": 1.7782,
13262
+ "step": 169900
13263
+ },
13264
+ {
13265
+ "epoch": 0.027908170954292,
13266
+ "grad_norm": 1.505018949508667,
13267
+ "learning_rate": 2.7641291720763612e-05,
13268
+ "loss": 1.7659,
13269
+ "step": 170000
13270
+ },
13271
+ {
13272
+ "epoch": 0.027908170954292,
13273
+ "eval_loss": 2.1508195400238037,
13274
+ "eval_runtime": 52.1147,
13275
+ "eval_samples_per_second": 195.607,
13276
+ "eval_steps_per_second": 1.535,
13277
+ "step": 170000
13278
+ },
13279
+ {
13280
+ "epoch": 0.028187252663834917,
13281
+ "grad_norm": 1.6319383382797241,
13282
+ "learning_rate": 2.7619274121974825e-05,
13283
+ "loss": 1.7709,
13284
+ "step": 170100
13285
+ },
13286
+ {
13287
+ "epoch": 0.02846633437337784,
13288
+ "grad_norm": 1.6314260959625244,
13289
+ "learning_rate": 2.759725446883319e-05,
13290
+ "loss": 1.7675,
13291
+ "step": 170200
13292
+ },
13293
+ {
13294
+ "epoch": 0.028745416082920756,
13295
+ "grad_norm": 1.471872329711914,
13296
+ "learning_rate": 2.7575232778609206e-05,
13297
+ "loss": 1.771,
13298
+ "step": 170300
13299
+ },
13300
+ {
13301
+ "epoch": 0.029024497792463678,
13302
+ "grad_norm": 1.5450881719589233,
13303
+ "learning_rate": 2.755320906857494e-05,
13304
+ "loss": 1.836,
13305
+ "step": 170400
13306
+ },
13307
+ {
13308
+ "epoch": 0.0293035795020066,
13309
+ "grad_norm": 1.5527344942092896,
13310
+ "learning_rate": 2.753118335600408e-05,
13311
+ "loss": 1.8808,
13312
+ "step": 170500
13313
+ },
13314
+ {
13315
+ "epoch": 0.029582661211549517,
13316
+ "grad_norm": 1.8364976644515991,
13317
+ "learning_rate": 2.7509155658171852e-05,
13318
+ "loss": 1.8776,
13319
+ "step": 170600
13320
+ },
13321
+ {
13322
+ "epoch": 0.02986174292109244,
13323
+ "grad_norm": 1.4847674369812012,
13324
+ "learning_rate": 2.7487125992355058e-05,
13325
+ "loss": 1.8724,
13326
+ "step": 170700
13327
+ },
13328
+ {
13329
+ "epoch": 0.030140824630635357,
13330
+ "grad_norm": 1.5595808029174805,
13331
+ "learning_rate": 2.7465094375832028e-05,
13332
+ "loss": 1.8799,
13333
+ "step": 170800
13334
+ },
13335
+ {
13336
+ "epoch": 0.030419906340178278,
13337
+ "grad_norm": 1.54868483543396,
13338
+ "learning_rate": 2.744306082588264e-05,
13339
+ "loss": 1.8704,
13340
+ "step": 170900
13341
+ },
13342
+ {
13343
+ "epoch": 0.030698988049721196,
13344
+ "grad_norm": 1.8504784107208252,
13345
+ "learning_rate": 2.742102535978827e-05,
13346
+ "loss": 1.8736,
13347
+ "step": 171000
13348
+ },
13349
+ {
13350
+ "epoch": 0.030698988049721196,
13351
+ "eval_loss": 2.1541635990142822,
13352
+ "eval_runtime": 52.1803,
13353
+ "eval_samples_per_second": 195.361,
13354
+ "eval_steps_per_second": 1.533,
13355
+ "step": 171000
13356
+ },
13357
+ {
13358
+ "epoch": 0.030978069759264117,
13359
+ "grad_norm": 1.6168150901794434,
13360
+ "learning_rate": 2.7398987994831822e-05,
13361
+ "loss": 1.8737,
13362
+ "step": 171100
13363
+ },
13364
+ {
13365
+ "epoch": 0.03125715146880704,
13366
+ "grad_norm": 1.6291587352752686,
13367
+ "learning_rate": 2.737694874829766e-05,
13368
+ "loss": 1.8691,
13369
+ "step": 171200
13370
+ },
13371
+ {
13372
+ "epoch": 0.03153623317834996,
13373
+ "grad_norm": 1.5887749195098877,
13374
+ "learning_rate": 2.735490763747164e-05,
13375
+ "loss": 1.8725,
13376
+ "step": 171300
13377
+ },
13378
+ {
13379
+ "epoch": 0.031815314887892875,
13380
+ "grad_norm": 1.6395853757858276,
13381
+ "learning_rate": 2.733286467964108e-05,
13382
+ "loss": 1.8857,
13383
+ "step": 171400
13384
+ },
13385
+ {
13386
+ "epoch": 0.0320943965974358,
13387
+ "grad_norm": 1.5826025009155273,
13388
+ "learning_rate": 2.7310819892094742e-05,
13389
+ "loss": 1.8546,
13390
+ "step": 171500
13391
+ },
13392
+ {
13393
+ "epoch": 0.03237347830697872,
13394
+ "grad_norm": 2.160349130630493,
13395
+ "learning_rate": 2.7288773292122827e-05,
13396
+ "loss": 1.8623,
13397
+ "step": 171600
13398
+ },
13399
+ {
13400
+ "epoch": 0.032652560016521635,
13401
+ "grad_norm": 1.6130859851837158,
13402
+ "learning_rate": 2.726672489701696e-05,
13403
+ "loss": 1.8629,
13404
+ "step": 171700
13405
+ },
13406
+ {
13407
+ "epoch": 0.03293164172606456,
13408
+ "grad_norm": 1.619787335395813,
13409
+ "learning_rate": 2.7244674724070163e-05,
13410
+ "loss": 1.8646,
13411
+ "step": 171800
13412
+ },
13413
+ {
13414
+ "epoch": 0.03321072343560748,
13415
+ "grad_norm": 2.099820375442505,
13416
+ "learning_rate": 2.722262279057687e-05,
13417
+ "loss": 1.8679,
13418
+ "step": 171900
13419
+ },
13420
+ {
13421
+ "epoch": 0.033489805145150396,
13422
+ "grad_norm": 1.7083640098571777,
13423
+ "learning_rate": 2.720056911383287e-05,
13424
+ "loss": 1.8554,
13425
+ "step": 172000
13426
+ },
13427
+ {
13428
+ "epoch": 0.033489805145150396,
13429
+ "eval_loss": 2.1523571014404297,
13430
+ "eval_runtime": 52.1491,
13431
+ "eval_samples_per_second": 195.478,
13432
+ "eval_steps_per_second": 1.534,
13433
+ "step": 172000
13434
+ },
13435
+ {
13436
+ "epoch": 0.033768886854693314,
13437
+ "grad_norm": 1.5392628908157349,
13438
+ "learning_rate": 2.717851371113534e-05,
13439
+ "loss": 1.8658,
13440
+ "step": 172100
13441
+ },
13442
+ {
13443
+ "epoch": 0.03404796856423624,
13444
+ "grad_norm": 2.007720708847046,
13445
+ "learning_rate": 2.715645659978281e-05,
13446
+ "loss": 1.861,
13447
+ "step": 172200
13448
+ },
13449
+ {
13450
+ "epoch": 0.03432705027377916,
13451
+ "grad_norm": 1.566613793373108,
13452
+ "learning_rate": 2.7134397797075145e-05,
13453
+ "loss": 1.8669,
13454
+ "step": 172300
13455
+ },
13456
+ {
13457
+ "epoch": 0.034606131983322075,
13458
+ "grad_norm": 1.588408350944519,
13459
+ "learning_rate": 2.7112337320313524e-05,
13460
+ "loss": 1.8568,
13461
+ "step": 172400
13462
+ },
13463
+ {
13464
+ "epoch": 0.034885213692865,
13465
+ "grad_norm": 1.6406699419021606,
13466
+ "learning_rate": 2.7090275186800474e-05,
13467
+ "loss": 1.8713,
13468
+ "step": 172500
13469
+ },
13470
+ {
13471
+ "epoch": 0.03516429540240792,
13472
+ "grad_norm": 1.5397433042526245,
13473
+ "learning_rate": 2.7068211413839782e-05,
13474
+ "loss": 1.8629,
13475
+ "step": 172600
13476
+ },
13477
+ {
13478
+ "epoch": 0.035443377111950836,
13479
+ "grad_norm": 1.5865190029144287,
13480
+ "learning_rate": 2.704614601873654e-05,
13481
+ "loss": 1.8579,
13482
+ "step": 172700
13483
+ },
13484
+ {
13485
+ "epoch": 0.035722458821493754,
13486
+ "grad_norm": 1.7077267169952393,
13487
+ "learning_rate": 2.702407901879712e-05,
13488
+ "loss": 1.8616,
13489
+ "step": 172800
13490
+ },
13491
+ {
13492
+ "epoch": 0.03600154053103668,
13493
+ "grad_norm": 1.727586269378662,
13494
+ "learning_rate": 2.7002010431329134e-05,
13495
+ "loss": 1.8574,
13496
+ "step": 172900
13497
+ },
13498
+ {
13499
+ "epoch": 0.036280622240579596,
13500
+ "grad_norm": 1.5238264799118042,
13501
+ "learning_rate": 2.6979940273641453e-05,
13502
+ "loss": 1.8595,
13503
+ "step": 173000
13504
+ },
13505
+ {
13506
+ "epoch": 0.036280622240579596,
13507
+ "eval_loss": 2.141134738922119,
13508
+ "eval_runtime": 52.1591,
13509
+ "eval_samples_per_second": 195.441,
13510
+ "eval_steps_per_second": 1.534,
13511
+ "step": 173000
13512
+ },
13513
+ {
13514
+ "epoch": 0.036559703950122514,
13515
+ "grad_norm": 1.5688259601593018,
13516
+ "learning_rate": 2.6957868563044176e-05,
13517
+ "loss": 1.8674,
13518
+ "step": 173100
13519
+ },
13520
+ {
13521
+ "epoch": 0.03683878565966544,
13522
+ "grad_norm": 1.5195534229278564,
13523
+ "learning_rate": 2.6935795316848612e-05,
13524
+ "loss": 1.8653,
13525
+ "step": 173200
13526
+ },
13527
+ {
13528
+ "epoch": 0.03711786736920836,
13529
+ "grad_norm": 1.6201164722442627,
13530
+ "learning_rate": 2.691372055236728e-05,
13531
+ "loss": 1.8579,
13532
+ "step": 173300
13533
+ },
13534
+ {
13535
+ "epoch": 0.037396949078751275,
13536
+ "grad_norm": 1.8065686225891113,
13537
+ "learning_rate": 2.6891644286913897e-05,
13538
+ "loss": 1.8755,
13539
+ "step": 173400
13540
+ },
13541
+ {
13542
+ "epoch": 0.0376760307882942,
13543
+ "grad_norm": 1.5661702156066895,
13544
+ "learning_rate": 2.6869566537803347e-05,
13545
+ "loss": 1.8552,
13546
+ "step": 173500
13547
+ },
13548
+ {
13549
+ "epoch": 0.03795511249783712,
13550
+ "grad_norm": 1.6565943956375122,
13551
+ "learning_rate": 2.6847487322351694e-05,
13552
+ "loss": 1.8664,
13553
+ "step": 173600
13554
+ },
13555
+ {
13556
+ "epoch": 0.038234194207380036,
13557
+ "grad_norm": 1.49613356590271,
13558
+ "learning_rate": 2.6825406657876123e-05,
13559
+ "loss": 1.8524,
13560
+ "step": 173700
13561
+ },
13562
+ {
13563
+ "epoch": 0.038513275916922954,
13564
+ "grad_norm": 1.5829864740371704,
13565
+ "learning_rate": 2.6803324561694988e-05,
13566
+ "loss": 1.8732,
13567
+ "step": 173800
13568
+ },
13569
+ {
13570
+ "epoch": 0.03879235762646588,
13571
+ "grad_norm": 1.6095563173294067,
13572
+ "learning_rate": 2.6781241051127738e-05,
13573
+ "loss": 1.8503,
13574
+ "step": 173900
13575
+ },
13576
+ {
13577
+ "epoch": 0.0390714393360088,
13578
+ "grad_norm": 1.5767251253128052,
13579
+ "learning_rate": 2.675915614349495e-05,
13580
+ "loss": 1.856,
13581
+ "step": 174000
13582
+ },
13583
+ {
13584
+ "epoch": 0.0390714393360088,
13585
+ "eval_loss": 2.1416378021240234,
13586
+ "eval_runtime": 52.1112,
13587
+ "eval_samples_per_second": 195.62,
13588
+ "eval_steps_per_second": 1.535,
13589
+ "step": 174000
13590
+ },
13591
+ {
13592
+ "epoch": 0.00027908170954291995,
13593
+ "grad_norm": 1.5513286590576172,
13594
+ "learning_rate": 2.6737069856118284e-05,
13595
+ "loss": 1.7542,
13596
+ "step": 174100
13597
+ },
13598
+ {
13599
+ "epoch": 0.0005581634190858399,
13600
+ "grad_norm": 1.5664585828781128,
13601
+ "learning_rate": 2.67149822063205e-05,
13602
+ "loss": 1.7515,
13603
+ "step": 174200
13604
+ },
13605
+ {
13606
+ "epoch": 0.0008372451286287599,
13607
+ "grad_norm": 1.5423948764801025,
13608
+ "learning_rate": 2.66928932114254e-05,
13609
+ "loss": 1.7557,
13610
+ "step": 174300
13611
+ },
13612
+ {
13613
+ "epoch": 0.0011163268381716798,
13614
+ "grad_norm": 1.5535671710968018,
13615
+ "learning_rate": 2.667080288875788e-05,
13616
+ "loss": 1.7569,
13617
+ "step": 174400
13618
+ },
13619
+ {
13620
+ "epoch": 0.0013954085477146,
13621
+ "grad_norm": 1.5592520236968994,
13622
+ "learning_rate": 2.6648711255643828e-05,
13623
+ "loss": 1.7506,
13624
+ "step": 174500
13625
+ },
13626
+ {
13627
+ "epoch": 0.0016744902572575198,
13628
+ "grad_norm": 1.5440510511398315,
13629
+ "learning_rate": 2.6626618329410198e-05,
13630
+ "loss": 1.7618,
13631
+ "step": 174600
13632
+ },
13633
+ {
13634
+ "epoch": 0.00195357196680044,
13635
+ "grad_norm": 1.54314124584198,
13636
+ "learning_rate": 2.6604524127384937e-05,
13637
+ "loss": 1.7491,
13638
+ "step": 174700
13639
+ },
13640
+ {
13641
+ "epoch": 0.0022326536763433596,
13642
+ "grad_norm": 1.592208743095398,
13643
+ "learning_rate": 2.658242866689702e-05,
13644
+ "loss": 1.7458,
13645
+ "step": 174800
13646
+ },
13647
+ {
13648
+ "epoch": 0.0025117353858862797,
13649
+ "grad_norm": 1.5204849243164062,
13650
+ "learning_rate": 2.6560331965276363e-05,
13651
+ "loss": 1.7523,
13652
+ "step": 174900
13653
+ },
13654
+ {
13655
+ "epoch": 0.0027908170954292,
13656
+ "grad_norm": 1.5259612798690796,
13657
+ "learning_rate": 2.653823403985391e-05,
13658
+ "loss": 1.7535,
13659
+ "step": 175000
13660
+ },
13661
+ {
13662
+ "epoch": 0.0027908170954292,
13663
+ "eval_loss": 2.1326749324798584,
13664
+ "eval_runtime": 52.049,
13665
+ "eval_samples_per_second": 195.854,
13666
+ "eval_steps_per_second": 1.537,
13667
+ "step": 175000
13668
+ },
13669
+ {
13670
+ "epoch": 0.00306989880497212,
13671
+ "grad_norm": 1.52047598361969,
13672
+ "learning_rate": 2.651613490796152e-05,
13673
+ "loss": 1.7447,
13674
+ "step": 175100
13675
+ },
13676
+ {
13677
+ "epoch": 0.0033489805145150396,
13678
+ "grad_norm": 1.5134586095809937,
13679
+ "learning_rate": 2.6494034586932027e-05,
13680
+ "loss": 1.7452,
13681
+ "step": 175200
13682
+ },
13683
+ {
13684
+ "epoch": 0.0036280622240579597,
13685
+ "grad_norm": 1.572095513343811,
13686
+ "learning_rate": 2.6471933094099177e-05,
13687
+ "loss": 1.7571,
13688
+ "step": 175300
13689
+ },
13690
+ {
13691
+ "epoch": 0.00390714393360088,
13692
+ "grad_norm": 1.5933750867843628,
13693
+ "learning_rate": 2.6449830446797653e-05,
13694
+ "loss": 1.745,
13695
+ "step": 175400
13696
+ },
13697
+ {
13698
+ "epoch": 0.0041862256431437995,
13699
+ "grad_norm": 1.6601353883743286,
13700
+ "learning_rate": 2.6427726662363023e-05,
13701
+ "loss": 1.7462,
13702
+ "step": 175500
13703
+ },
13704
+ {
13705
+ "epoch": 0.004465307352686719,
13706
+ "grad_norm": 1.5466818809509277,
13707
+ "learning_rate": 2.640562175813177e-05,
13708
+ "loss": 1.7573,
13709
+ "step": 175600
13710
+ },
13711
+ {
13712
+ "epoch": 0.00474438906222964,
13713
+ "grad_norm": 1.5273200273513794,
13714
+ "learning_rate": 2.6383515751441234e-05,
13715
+ "loss": 1.7578,
13716
+ "step": 175700
13717
+ },
13718
+ {
13719
+ "epoch": 0.005023470771772559,
13720
+ "grad_norm": 1.609778881072998,
13721
+ "learning_rate": 2.636140865962965e-05,
13722
+ "loss": 1.7513,
13723
+ "step": 175800
13724
+ },
13725
+ {
13726
+ "epoch": 0.00530255248131548,
13727
+ "grad_norm": 1.6019160747528076,
13728
+ "learning_rate": 2.633930050003606e-05,
13729
+ "loss": 1.7557,
13730
+ "step": 175900
13731
+ },
13732
+ {
13733
+ "epoch": 0.0055816341908584,
13734
+ "grad_norm": 1.5547572374343872,
13735
+ "learning_rate": 2.6317191290000383e-05,
13736
+ "loss": 1.7645,
13737
+ "step": 176000
13738
+ },
13739
+ {
13740
+ "epoch": 0.0055816341908584,
13741
+ "eval_loss": 2.141494035720825,
13742
+ "eval_runtime": 51.4645,
13743
+ "eval_samples_per_second": 198.078,
13744
+ "eval_steps_per_second": 1.554,
13745
+ "step": 176000
13746
+ },
13747
+ {
13748
+ "epoch": 0.005860715900401319,
13749
+ "grad_norm": 1.6100679636001587,
13750
+ "learning_rate": 2.629508104686334e-05,
13751
+ "loss": 1.7566,
13752
+ "step": 176100
13753
+ },
13754
+ {
13755
+ "epoch": 0.00613979760994424,
13756
+ "grad_norm": 1.5966265201568604,
13757
+ "learning_rate": 2.6272969787966466e-05,
13758
+ "loss": 1.7511,
13759
+ "step": 176200
13760
+ },
13761
+ {
13762
+ "epoch": 0.0064188793194871595,
13763
+ "grad_norm": 1.5519967079162598,
13764
+ "learning_rate": 2.6250857530652113e-05,
13765
+ "loss": 1.7534,
13766
+ "step": 176300
13767
+ },
13768
+ {
13769
+ "epoch": 0.006697961029030079,
13770
+ "grad_norm": 1.5537617206573486,
13771
+ "learning_rate": 2.6228744292263367e-05,
13772
+ "loss": 1.7448,
13773
+ "step": 176400
13774
+ },
13775
+ {
13776
+ "epoch": 0.006977042738573,
13777
+ "grad_norm": 1.5397429466247559,
13778
+ "learning_rate": 2.6206630090144153e-05,
13779
+ "loss": 1.7456,
13780
+ "step": 176500
13781
+ },
13782
+ {
13783
+ "epoch": 0.0072561244481159195,
13784
+ "grad_norm": 1.5131994485855103,
13785
+ "learning_rate": 2.618451494163908e-05,
13786
+ "loss": 1.7472,
13787
+ "step": 176600
13788
+ },
13789
+ {
13790
+ "epoch": 0.007535206157658839,
13791
+ "grad_norm": 1.553226113319397,
13792
+ "learning_rate": 2.6162398864093553e-05,
13793
+ "loss": 1.7588,
13794
+ "step": 176700
13795
+ },
13796
+ {
13797
+ "epoch": 0.00781428786720176,
13798
+ "grad_norm": 1.5782634019851685,
13799
+ "learning_rate": 2.6140281874853666e-05,
13800
+ "loss": 1.7498,
13801
+ "step": 176800
13802
+ },
13803
+ {
13804
+ "epoch": 0.00809336957674468,
13805
+ "grad_norm": 1.5181629657745361,
13806
+ "learning_rate": 2.6118163991266275e-05,
13807
+ "loss": 1.7525,
13808
+ "step": 176900
13809
+ },
13810
+ {
13811
+ "epoch": 0.008372451286287599,
13812
+ "grad_norm": 1.622118353843689,
13813
+ "learning_rate": 2.6096045230678888e-05,
13814
+ "loss": 1.7472,
13815
+ "step": 177000
13816
+ },
13817
+ {
13818
+ "epoch": 0.008372451286287599,
13819
+ "eval_loss": 2.1567530632019043,
13820
+ "eval_runtime": 51.4987,
13821
+ "eval_samples_per_second": 197.947,
13822
+ "eval_steps_per_second": 1.553,
13823
+ "step": 177000
13824
+ },
13825
+ {
13826
+ "epoch": 0.008651532995830519,
13827
+ "grad_norm": 1.5844262838363647,
13828
+ "learning_rate": 2.6073925610439738e-05,
13829
+ "loss": 1.7489,
13830
+ "step": 177100
13831
+ },
13832
+ {
13833
+ "epoch": 0.008930614705373438,
13834
+ "grad_norm": 1.4944721460342407,
13835
+ "learning_rate": 2.6051805147897713e-05,
13836
+ "loss": 1.7535,
13837
+ "step": 177200
13838
+ },
13839
+ {
13840
+ "epoch": 0.00920969641491636,
13841
+ "grad_norm": 1.607365608215332,
13842
+ "learning_rate": 2.602968386040236e-05,
13843
+ "loss": 1.7476,
13844
+ "step": 177300
13845
+ },
13846
+ {
13847
+ "epoch": 0.00948877812445928,
13848
+ "grad_norm": 1.5790349245071411,
13849
+ "learning_rate": 2.6007561765303878e-05,
13850
+ "loss": 1.7465,
13851
+ "step": 177400
13852
+ },
13853
+ {
13854
+ "epoch": 0.0097678598340022,
13855
+ "grad_norm": 1.5833547115325928,
13856
+ "learning_rate": 2.5985438879953107e-05,
13857
+ "loss": 1.7581,
13858
+ "step": 177500
13859
+ },
13860
+ {
13861
+ "epoch": 0.010046941543545119,
13862
+ "grad_norm": 1.5244640111923218,
13863
+ "learning_rate": 2.5963315221701496e-05,
13864
+ "loss": 1.7489,
13865
+ "step": 177600
13866
+ },
13867
+ {
13868
+ "epoch": 0.010326023253088039,
13869
+ "grad_norm": 1.6332496404647827,
13870
+ "learning_rate": 2.5941190807901117e-05,
13871
+ "loss": 1.7593,
13872
+ "step": 177700
13873
+ },
13874
+ {
13875
+ "epoch": 0.01060510496263096,
13876
+ "grad_norm": 1.4967930316925049,
13877
+ "learning_rate": 2.5919065655904606e-05,
13878
+ "loss": 1.7487,
13879
+ "step": 177800
13880
+ },
13881
+ {
13882
+ "epoch": 0.01088418667217388,
13883
+ "grad_norm": 1.5874158143997192,
13884
+ "learning_rate": 2.5896939783065198e-05,
13885
+ "loss": 1.7488,
13886
+ "step": 177900
13887
+ },
13888
+ {
13889
+ "epoch": 0.0111632683817168,
13890
+ "grad_norm": 1.6334315538406372,
13891
+ "learning_rate": 2.587481320673669e-05,
13892
+ "loss": 1.7558,
13893
+ "step": 178000
13894
+ },
13895
+ {
13896
+ "epoch": 0.0111632683817168,
13897
+ "eval_loss": 2.1407663822174072,
13898
+ "eval_runtime": 51.564,
13899
+ "eval_samples_per_second": 197.696,
13900
+ "eval_steps_per_second": 1.551,
13901
+ "step": 178000
13902
+ },
13903
+ {
13904
+ "epoch": 0.011442350091259719,
13905
+ "grad_norm": 1.5070706605911255,
13906
+ "learning_rate": 2.5852685944273437e-05,
13907
+ "loss": 1.7515,
13908
+ "step": 178100
13909
+ },
13910
+ {
13911
+ "epoch": 0.011721431800802639,
13912
+ "grad_norm": 1.675197958946228,
13913
+ "learning_rate": 2.583055801303031e-05,
13914
+ "loss": 1.7517,
13915
+ "step": 178200
13916
+ },
13917
+ {
13918
+ "epoch": 0.012000513510345558,
13919
+ "grad_norm": 1.6129719018936157,
13920
+ "learning_rate": 2.5808429430362734e-05,
13921
+ "loss": 1.739,
13922
+ "step": 178300
13923
+ },
13924
+ {
13925
+ "epoch": 0.01227959521988848,
13926
+ "grad_norm": 1.6314342021942139,
13927
+ "learning_rate": 2.5786300213626623e-05,
13928
+ "loss": 1.7373,
13929
+ "step": 178400
13930
+ },
13931
+ {
13932
+ "epoch": 0.0125586769294314,
13933
+ "grad_norm": 1.4758597612380981,
13934
+ "learning_rate": 2.576417038017841e-05,
13935
+ "loss": 1.7512,
13936
+ "step": 178500
13937
+ },
13938
+ {
13939
+ "epoch": 0.012837758638974319,
13940
+ "grad_norm": 1.6322437524795532,
13941
+ "learning_rate": 2.574203994737498e-05,
13942
+ "loss": 1.7529,
13943
+ "step": 178600
13944
+ },
13945
+ {
13946
+ "epoch": 0.013116840348517239,
13947
+ "grad_norm": 1.6611186265945435,
13948
+ "learning_rate": 2.5719908932573716e-05,
13949
+ "loss": 1.7529,
13950
+ "step": 178700
13951
+ },
13952
+ {
13953
+ "epoch": 0.013395922058060158,
13954
+ "grad_norm": 1.6254630088806152,
13955
+ "learning_rate": 2.5697777353132434e-05,
13956
+ "loss": 1.7548,
13957
+ "step": 178800
13958
+ },
13959
+ {
13960
+ "epoch": 0.013675003767603078,
13961
+ "grad_norm": 1.6417994499206543,
13962
+ "learning_rate": 2.567564522640942e-05,
13963
+ "loss": 1.7501,
13964
+ "step": 178900
13965
+ },
13966
+ {
13967
+ "epoch": 0.013954085477146,
13968
+ "grad_norm": 1.5359156131744385,
13969
+ "learning_rate": 2.5653512569763377e-05,
13970
+ "loss": 1.7562,
13971
+ "step": 179000
13972
+ },
13973
+ {
13974
+ "epoch": 0.013954085477146,
13975
+ "eval_loss": 2.144591808319092,
13976
+ "eval_runtime": 51.5364,
13977
+ "eval_samples_per_second": 197.802,
13978
+ "eval_steps_per_second": 1.552,
13979
+ "step": 179000
13980
+ },
13981
+ {
13982
+ "epoch": 0.01423316718668892,
13983
+ "grad_norm": 1.5880595445632935,
13984
+ "learning_rate": 2.5631379400553416e-05,
13985
+ "loss": 1.75,
13986
+ "step": 179100
13987
+ },
13988
+ {
13989
+ "epoch": 0.014512248896231839,
13990
+ "grad_norm": 1.6134679317474365,
13991
+ "learning_rate": 2.560924573613906e-05,
13992
+ "loss": 1.7508,
13993
+ "step": 179200
13994
+ },
13995
+ {
13996
+ "epoch": 0.014791330605774759,
13997
+ "grad_norm": 1.5464352369308472,
13998
+ "learning_rate": 2.5587111593880205e-05,
13999
+ "loss": 1.7502,
14000
+ "step": 179300
14001
+ },
14002
+ {
14003
+ "epoch": 0.015070412315317678,
14004
+ "grad_norm": 1.573649525642395,
14005
+ "learning_rate": 2.556497699113714e-05,
14006
+ "loss": 1.7435,
14007
+ "step": 179400
14008
+ },
14009
+ {
14010
+ "epoch": 0.015349494024860598,
14011
+ "grad_norm": 1.5665711164474487,
14012
+ "learning_rate": 2.554284194527051e-05,
14013
+ "loss": 1.7462,
14014
+ "step": 179500
14015
+ },
14016
+ {
14017
+ "epoch": 0.01562857573440352,
14018
+ "grad_norm": 1.606072187423706,
14019
+ "learning_rate": 2.5520706473641316e-05,
14020
+ "loss": 1.7516,
14021
+ "step": 179600
14022
+ },
14023
+ {
14024
+ "epoch": 0.015907657443946437,
14025
+ "grad_norm": 1.5898959636688232,
14026
+ "learning_rate": 2.549857059361086e-05,
14027
+ "loss": 1.7482,
14028
+ "step": 179700
14029
+ },
14030
+ {
14031
+ "epoch": 0.01618673915348936,
14032
+ "grad_norm": 1.6288598775863647,
14033
+ "learning_rate": 2.547643432254081e-05,
14034
+ "loss": 1.7365,
14035
+ "step": 179800
14036
+ },
14037
+ {
14038
+ "epoch": 0.01646582086303228,
14039
+ "grad_norm": 1.5765552520751953,
14040
+ "learning_rate": 2.545429767779311e-05,
14041
+ "loss": 1.7346,
14042
+ "step": 179900
14043
+ },
14044
+ {
14045
+ "epoch": 0.016744902572575198,
14046
+ "grad_norm": 1.5909677743911743,
14047
+ "learning_rate": 2.5432160676729994e-05,
14048
+ "loss": 1.7493,
14049
+ "step": 180000
14050
+ },
14051
+ {
14052
+ "epoch": 0.016744902572575198,
14053
+ "eval_loss": 2.1469063758850098,
14054
+ "eval_runtime": 52.5101,
14055
+ "eval_samples_per_second": 194.134,
14056
+ "eval_steps_per_second": 1.524,
14057
+ "step": 180000
14058
+ },
14059
+ {
14060
+ "epoch": 0.01702398428211812,
14061
+ "grad_norm": 1.6108888387680054,
14062
+ "learning_rate": 2.5410023336713996e-05,
14063
+ "loss": 1.749,
14064
+ "step": 180100
14065
+ },
14066
+ {
14067
+ "epoch": 0.017303065991661037,
14068
+ "grad_norm": 1.5427972078323364,
14069
+ "learning_rate": 2.538788567510791e-05,
14070
+ "loss": 1.738,
14071
+ "step": 180200
14072
+ },
14073
+ {
14074
+ "epoch": 0.01758214770120396,
14075
+ "grad_norm": 1.5925029516220093,
14076
+ "learning_rate": 2.5365747709274767e-05,
14077
+ "loss": 1.7418,
14078
+ "step": 180300
14079
+ },
14080
+ {
14081
+ "epoch": 0.017861229410746877,
14082
+ "grad_norm": 1.5784283876419067,
14083
+ "learning_rate": 2.5343609456577867e-05,
14084
+ "loss": 1.7417,
14085
+ "step": 180400
14086
+ },
14087
+ {
14088
+ "epoch": 0.018140311120289798,
14089
+ "grad_norm": 1.623561978340149,
14090
+ "learning_rate": 2.53214709343807e-05,
14091
+ "loss": 1.7443,
14092
+ "step": 180500
14093
+ },
14094
+ {
14095
+ "epoch": 0.01841939282983272,
14096
+ "grad_norm": 1.6505674123764038,
14097
+ "learning_rate": 2.5299332160046985e-05,
14098
+ "loss": 1.7454,
14099
+ "step": 180600
14100
+ },
14101
+ {
14102
+ "epoch": 0.018698474539375638,
14103
+ "grad_norm": 1.5555040836334229,
14104
+ "learning_rate": 2.5277193150940638e-05,
14105
+ "loss": 1.7416,
14106
+ "step": 180700
14107
+ },
14108
+ {
14109
+ "epoch": 0.01897755624891856,
14110
+ "grad_norm": 1.6162723302841187,
14111
+ "learning_rate": 2.525505392442577e-05,
14112
+ "loss": 1.7433,
14113
+ "step": 180800
14114
+ },
14115
+ {
14116
+ "epoch": 0.019256637958461477,
14117
+ "grad_norm": 1.5440572500228882,
14118
+ "learning_rate": 2.523291449786663e-05,
14119
+ "loss": 1.7438,
14120
+ "step": 180900
14121
+ },
14122
+ {
14123
+ "epoch": 0.0195357196680044,
14124
+ "grad_norm": 1.596146583557129,
14125
+ "learning_rate": 2.5210774888627664e-05,
14126
+ "loss": 1.7425,
14127
+ "step": 181000
14128
+ },
14129
+ {
14130
+ "epoch": 0.0195357196680044,
14131
+ "eval_loss": 2.140672206878662,
14132
+ "eval_runtime": 51.8004,
14133
+ "eval_samples_per_second": 196.794,
14134
+ "eval_steps_per_second": 1.544,
14135
+ "step": 181000
14136
+ },
14137
+ {
14138
+ "epoch": 0.01981480137754732,
14139
+ "grad_norm": 1.6086748838424683,
14140
+ "learning_rate": 2.5188635114073434e-05,
14141
+ "loss": 1.7488,
14142
+ "step": 181100
14143
+ },
14144
+ {
14145
+ "epoch": 0.020093883087090238,
14146
+ "grad_norm": 1.564663290977478,
14147
+ "learning_rate": 2.516649519156864e-05,
14148
+ "loss": 1.7452,
14149
+ "step": 181200
14150
+ },
14151
+ {
14152
+ "epoch": 0.02037296479663316,
14153
+ "grad_norm": 1.5975944995880127,
14154
+ "learning_rate": 2.51443551384781e-05,
14155
+ "loss": 1.7419,
14156
+ "step": 181300
14157
+ },
14158
+ {
14159
+ "epoch": 0.020652046506176077,
14160
+ "grad_norm": 1.6056960821151733,
14161
+ "learning_rate": 2.5122214972166724e-05,
14162
+ "loss": 1.7536,
14163
+ "step": 181400
14164
+ },
14165
+ {
14166
+ "epoch": 0.020931128215719,
14167
+ "grad_norm": 1.6348010301589966,
14168
+ "learning_rate": 2.5100074709999526e-05,
14169
+ "loss": 1.7505,
14170
+ "step": 181500
14171
+ },
14172
+ {
14173
+ "epoch": 0.02121020992526192,
14174
+ "grad_norm": 1.4651880264282227,
14175
+ "learning_rate": 2.5077934369341594e-05,
14176
+ "loss": 1.7474,
14177
+ "step": 181600
14178
+ },
14179
+ {
14180
+ "epoch": 0.021489291634804838,
14181
+ "grad_norm": 1.6000345945358276,
14182
+ "learning_rate": 2.505579396755806e-05,
14183
+ "loss": 1.7455,
14184
+ "step": 181700
14185
+ },
14186
+ {
14187
+ "epoch": 0.02176837334434776,
14188
+ "grad_norm": 1.6549137830734253,
14189
+ "learning_rate": 2.503365352201413e-05,
14190
+ "loss": 1.7404,
14191
+ "step": 181800
14192
+ },
14193
+ {
14194
+ "epoch": 0.022047455053890677,
14195
+ "grad_norm": 1.6172484159469604,
14196
+ "learning_rate": 2.5011513050075014e-05,
14197
+ "loss": 1.7457,
14198
+ "step": 181900
14199
+ },
14200
+ {
14201
+ "epoch": 0.0223265367634336,
14202
+ "grad_norm": 1.6283797025680542,
14203
+ "learning_rate": 2.4989372569105962e-05,
14204
+ "loss": 1.7411,
14205
+ "step": 182000
14206
+ },
14207
+ {
14208
+ "epoch": 0.0223265367634336,
14209
+ "eval_loss": 2.1432528495788574,
14210
+ "eval_runtime": 51.7742,
14211
+ "eval_samples_per_second": 196.894,
14212
+ "eval_steps_per_second": 1.545,
14213
+ "step": 182000
14214
+ },
14215
+ {
14216
+ "epoch": 0.022605618472976517,
14217
+ "grad_norm": 1.5319279432296753,
14218
+ "learning_rate": 2.4967232096472236e-05,
14219
+ "loss": 1.76,
14220
+ "step": 182100
14221
+ },
14222
+ {
14223
+ "epoch": 0.022884700182519438,
14224
+ "grad_norm": 1.600860595703125,
14225
+ "learning_rate": 2.4945091649539086e-05,
14226
+ "loss": 1.7416,
14227
+ "step": 182200
14228
+ },
14229
+ {
14230
+ "epoch": 0.02316378189206236,
14231
+ "grad_norm": 1.5592856407165527,
14232
+ "learning_rate": 2.4922951245671723e-05,
14233
+ "loss": 1.7421,
14234
+ "step": 182300
14235
+ },
14236
+ {
14237
+ "epoch": 0.023442863601605277,
14238
+ "grad_norm": 1.5361909866333008,
14239
+ "learning_rate": 2.4900810902235356e-05,
14240
+ "loss": 1.7436,
14241
+ "step": 182400
14242
+ },
14243
+ {
14244
+ "epoch": 0.0237219453111482,
14245
+ "grad_norm": 1.526672124862671,
14246
+ "learning_rate": 2.4878670636595117e-05,
14247
+ "loss": 1.7418,
14248
+ "step": 182500
14249
+ },
14250
+ {
14251
+ "epoch": 0.024001027020691117,
14252
+ "grad_norm": 1.5167595148086548,
14253
+ "learning_rate": 2.4856530466116112e-05,
14254
+ "loss": 1.7389,
14255
+ "step": 182600
14256
+ },
14257
+ {
14258
+ "epoch": 0.024280108730234038,
14259
+ "grad_norm": 1.6046936511993408,
14260
+ "learning_rate": 2.4834390408163324e-05,
14261
+ "loss": 1.7459,
14262
+ "step": 182700
14263
+ },
14264
+ {
14265
+ "epoch": 0.02455919043977696,
14266
+ "grad_norm": 1.572601079940796,
14267
+ "learning_rate": 2.4812250480101693e-05,
14268
+ "loss": 1.7464,
14269
+ "step": 182800
14270
+ },
14271
+ {
14272
+ "epoch": 0.024838272149319877,
14273
+ "grad_norm": 1.5549017190933228,
14274
+ "learning_rate": 2.479011069929603e-05,
14275
+ "loss": 1.7356,
14276
+ "step": 182900
14277
+ },
14278
+ {
14279
+ "epoch": 0.0251173538588628,
14280
+ "grad_norm": 1.5163230895996094,
14281
+ "learning_rate": 2.476797108311106e-05,
14282
+ "loss": 1.7427,
14283
+ "step": 183000
14284
+ },
14285
+ {
14286
+ "epoch": 0.0251173538588628,
14287
+ "eval_loss": 2.1313729286193848,
14288
+ "eval_runtime": 51.744,
14289
+ "eval_samples_per_second": 197.009,
14290
+ "eval_steps_per_second": 1.546,
14291
+ "step": 183000
14292
+ },
14293
+ {
14294
+ "epoch": 0.025396435568405717,
14295
+ "grad_norm": 1.5936397314071655,
14296
+ "learning_rate": 2.474583164891133e-05,
14297
+ "loss": 1.7446,
14298
+ "step": 183100
14299
+ },
14300
+ {
14301
+ "epoch": 0.025675517277948638,
14302
+ "grad_norm": 1.5533971786499023,
14303
+ "learning_rate": 2.4723692414061295e-05,
14304
+ "loss": 1.7452,
14305
+ "step": 183200
14306
+ },
14307
+ {
14308
+ "epoch": 0.025954598987491556,
14309
+ "grad_norm": 1.6152623891830444,
14310
+ "learning_rate": 2.4701553395925214e-05,
14311
+ "loss": 1.7425,
14312
+ "step": 183300
14313
+ },
14314
+ {
14315
+ "epoch": 0.026233680697034478,
14316
+ "grad_norm": 1.4908332824707031,
14317
+ "learning_rate": 2.4679414611867214e-05,
14318
+ "loss": 1.755,
14319
+ "step": 183400
14320
+ },
14321
+ {
14322
+ "epoch": 0.0265127624065774,
14323
+ "grad_norm": 1.6560674905776978,
14324
+ "learning_rate": 2.4657276079251194e-05,
14325
+ "loss": 1.7477,
14326
+ "step": 183500
14327
+ },
14328
+ {
14329
+ "epoch": 0.026791844116120317,
14330
+ "grad_norm": 1.7160277366638184,
14331
+ "learning_rate": 2.4635137815440894e-05,
14332
+ "loss": 1.7446,
14333
+ "step": 183600
14334
+ },
14335
+ {
14336
+ "epoch": 0.02707092582566324,
14337
+ "grad_norm": 1.4447243213653564,
14338
+ "learning_rate": 2.461299983779983e-05,
14339
+ "loss": 1.7403,
14340
+ "step": 183700
14341
+ },
14342
+ {
14343
+ "epoch": 0.027350007535206156,
14344
+ "grad_norm": 1.605068325996399,
14345
+ "learning_rate": 2.459086216369129e-05,
14346
+ "loss": 1.7439,
14347
+ "step": 183800
14348
+ },
14349
+ {
14350
+ "epoch": 0.027629089244749078,
14351
+ "grad_norm": 1.6601132154464722,
14352
+ "learning_rate": 2.4568724810478325e-05,
14353
+ "loss": 1.7439,
14354
+ "step": 183900
14355
+ },
14356
+ {
14357
+ "epoch": 0.027908170954292,
14358
+ "grad_norm": 1.546660304069519,
14359
+ "learning_rate": 2.4546587795523733e-05,
14360
+ "loss": 1.7339,
14361
+ "step": 184000
14362
+ },
14363
+ {
14364
+ "epoch": 0.027908170954292,
14365
+ "eval_loss": 2.1373305320739746,
14366
+ "eval_runtime": 51.7742,
14367
+ "eval_samples_per_second": 196.893,
14368
+ "eval_steps_per_second": 1.545,
14369
+ "step": 184000
14370
+ },
14371
+ {
14372
+ "epoch": 0.00027908170954291995,
14373
+ "grad_norm": 1.6656001806259155,
14374
+ "learning_rate": 2.4524451136190048e-05,
14375
+ "loss": 1.8435,
14376
+ "step": 184100
14377
+ },
14378
+ {
14379
+ "epoch": 0.0005581634190858399,
14380
+ "grad_norm": 1.6392732858657837,
14381
+ "learning_rate": 2.4502314849839546e-05,
14382
+ "loss": 1.8453,
14383
+ "step": 184200
14384
+ },
14385
+ {
14386
+ "epoch": 0.0008372451286287599,
14387
+ "grad_norm": 1.7409366369247437,
14388
+ "learning_rate": 2.4480178953834162e-05,
14389
+ "loss": 1.8407,
14390
+ "step": 184300
14391
+ },
14392
+ {
14393
+ "epoch": 0.0011163268381716798,
14394
+ "grad_norm": 1.5873730182647705,
14395
+ "learning_rate": 2.445804346553557e-05,
14396
+ "loss": 1.8428,
14397
+ "step": 184400
14398
+ },
14399
+ {
14400
+ "epoch": 0.0013954085477146,
14401
+ "grad_norm": 1.5073753595352173,
14402
+ "learning_rate": 2.4435908402305108e-05,
14403
+ "loss": 1.8379,
14404
+ "step": 184500
14405
+ },
14406
+ {
14407
+ "epoch": 0.0016744902572575198,
14408
+ "grad_norm": 2.3680567741394043,
14409
+ "learning_rate": 2.4413773781503788e-05,
14410
+ "loss": 1.83,
14411
+ "step": 184600
14412
+ },
14413
+ {
14414
+ "epoch": 0.00195357196680044,
14415
+ "grad_norm": 1.6823689937591553,
14416
+ "learning_rate": 2.4391639620492243e-05,
14417
+ "loss": 1.8411,
14418
+ "step": 184700
14419
+ },
14420
+ {
14421
+ "epoch": 0.0022326536763433596,
14422
+ "grad_norm": 1.5574064254760742,
14423
+ "learning_rate": 2.4369505936630786e-05,
14424
+ "loss": 1.8351,
14425
+ "step": 184800
14426
+ },
14427
+ {
14428
+ "epoch": 0.0025117353858862797,
14429
+ "grad_norm": 2.146876096725464,
14430
+ "learning_rate": 2.4347372747279337e-05,
14431
+ "loss": 1.833,
14432
+ "step": 184900
14433
+ },
14434
+ {
14435
+ "epoch": 0.0027908170954292,
14436
+ "grad_norm": 1.6746612787246704,
14437
+ "learning_rate": 2.4325240069797438e-05,
14438
+ "loss": 1.8284,
14439
+ "step": 185000
14440
+ },
14441
+ {
14442
+ "epoch": 0.0027908170954292,
14443
+ "eval_loss": 2.133864641189575,
14444
+ "eval_runtime": 52.0009,
14445
+ "eval_samples_per_second": 196.035,
14446
+ "eval_steps_per_second": 1.538,
14447
+ "step": 185000
14448
+ },
14449
+ {
14450
+ "epoch": 0.00306989880497212,
14451
+ "grad_norm": 1.6454411745071411,
14452
+ "learning_rate": 2.430310792154422e-05,
14453
+ "loss": 1.8312,
14454
+ "step": 185100
14455
+ },
14456
+ {
14457
+ "epoch": 0.0033489805145150396,
14458
+ "grad_norm": 1.8907885551452637,
14459
+ "learning_rate": 2.4280976319878392e-05,
14460
+ "loss": 1.8384,
14461
+ "step": 185200
14462
+ },
14463
+ {
14464
+ "epoch": 0.0036280622240579597,
14465
+ "grad_norm": 1.6488444805145264,
14466
+ "learning_rate": 2.425884528215825e-05,
14467
+ "loss": 1.8241,
14468
+ "step": 185300
14469
+ },
14470
+ {
14471
+ "epoch": 0.00390714393360088,
14472
+ "grad_norm": 1.6460552215576172,
14473
+ "learning_rate": 2.423671482574164e-05,
14474
+ "loss": 1.8318,
14475
+ "step": 185400
14476
+ },
14477
+ {
14478
+ "epoch": 0.0041862256431437995,
14479
+ "grad_norm": 1.6229537725448608,
14480
+ "learning_rate": 2.4214584967985962e-05,
14481
+ "loss": 1.8349,
14482
+ "step": 185500
14483
+ },
14484
+ {
14485
+ "epoch": 0.004465307352686719,
14486
+ "grad_norm": 1.5805400609970093,
14487
+ "learning_rate": 2.419245572624812e-05,
14488
+ "loss": 1.823,
14489
+ "step": 185600
14490
+ },
14491
+ {
14492
+ "epoch": 0.00474438906222964,
14493
+ "grad_norm": 1.8274881839752197,
14494
+ "learning_rate": 2.4170327117884562e-05,
14495
+ "loss": 1.8363,
14496
+ "step": 185700
14497
+ },
14498
+ {
14499
+ "epoch": 0.005023470771772559,
14500
+ "grad_norm": 1.5922763347625732,
14501
+ "learning_rate": 2.4148199160251238e-05,
14502
+ "loss": 1.8272,
14503
+ "step": 185800
14504
+ },
14505
+ {
14506
+ "epoch": 0.00530255248131548,
14507
+ "grad_norm": 1.6500530242919922,
14508
+ "learning_rate": 2.4126071870703574e-05,
14509
+ "loss": 1.821,
14510
+ "step": 185900
14511
+ },
14512
+ {
14513
+ "epoch": 0.0055816341908584,
14514
+ "grad_norm": 1.6244685649871826,
14515
+ "learning_rate": 2.410394526659647e-05,
14516
+ "loss": 1.8287,
14517
+ "step": 186000
14518
+ },
14519
+ {
14520
+ "epoch": 0.0055816341908584,
14521
+ "eval_loss": 2.131998300552368,
14522
+ "eval_runtime": 51.5187,
14523
+ "eval_samples_per_second": 197.87,
14524
+ "eval_steps_per_second": 1.553,
14525
+ "step": 186000
14526
  }
14527
  ],
14528
  "logging_steps": 100,
 
@@ -13060,7 +14542,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.4574492479127552e+19,
+ "total_flos": 1.6232668270166016e+19,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9318402efc23f8b2e09dec877ba7b88863d76a00aceeef7c22f944e9f6a43e28
+ oid sha256:6be0aaef9589a43e4cde380bc3e83ccd55ea3b262dc3f11f0bbc4b35fc934376
  size 5777
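
Note: the trainer_state.json change above mostly appends new log entries (steps 167100 through 186000) carrying loss, grad_norm, learning_rate, and periodic eval_loss values, along with updated epoch, global_step, and total_flos fields. The following is a small sketch for pulling the evaluation curve out of the updated file; it assumes the entries shown in the diff sit in the standard log_history array of a Hugging Face Trainer state file and that the file has been downloaded locally.

import json

# Hypothetical local path to the trainer_state.json from this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Assumption: the logged entries shown in the diff are stored under "log_history".
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state.get("log_history", [])
    if "eval_loss" in entry
]

for step, loss in eval_points:
    print(f"step {step}: eval_loss {loss:.4f}")
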