Model save
Browse files- README.md +1 -1
- all_results.json +3 -3
- train_results.json +3 -3
- trainer_state.json +255 -255
README.md
CHANGED
|
@@ -27,7 +27,7 @@ print(output["generated_text"])
|
|
| 27 |
|
| 28 |
## Training procedure
|
| 29 |
|
| 30 |
-
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/causalai/huggingface/runs/
|
| 31 |
|
| 32 |
|
| 33 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
|
|
|
| 27 |
|
| 28 |
## Training procedure
|
| 29 |
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/causalai/huggingface/runs/ds70toql)
|
| 31 |
|
| 32 |
|
| 33 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
all_results.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
-
"train_loss": 0.
|
| 4 |
-
"train_runtime":
|
| 5 |
"train_samples": 17056,
|
| 6 |
-
"train_samples_per_second": 0.
|
| 7 |
"train_steps_per_second": 0.001
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
+
"train_loss": 0.051685971918662914,
|
| 4 |
+
"train_runtime": 185838.8689,
|
| 5 |
"train_samples": 17056,
|
| 6 |
+
"train_samples_per_second": 0.092,
|
| 7 |
"train_steps_per_second": 0.001
|
| 8 |
}
|
train_results.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
-
"train_loss": 0.
|
| 4 |
-
"train_runtime":
|
| 5 |
"train_samples": 17056,
|
| 6 |
-
"train_samples_per_second": 0.
|
| 7 |
"train_steps_per_second": 0.001
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"total_flos": 0.0,
|
| 3 |
+
"train_loss": 0.051685971918662914,
|
| 4 |
+
"train_runtime": 185838.8689,
|
| 5 |
"train_samples": 17056,
|
| 6 |
+
"train_samples_per_second": 0.092,
|
| 7 |
"train_steps_per_second": 0.001
|
| 8 |
}
|
trainer_state.json
CHANGED
|
@@ -9,398 +9,398 @@
|
|
| 9 |
"is_world_process_zero": true,
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
-
"completion_length":
|
| 13 |
"epoch": 0.0375234521575985,
|
| 14 |
-
"grad_norm":
|
| 15 |
-
"kl": 0.
|
| 16 |
"learning_rate": 7.1428571428571436e-06,
|
| 17 |
-
"loss": 0.
|
| 18 |
-
"reward": 0.
|
| 19 |
-
"reward_std": 0.
|
| 20 |
-
"rewards/accuracy_reward": 0.
|
| 21 |
-
"rewards/format_reward": 0.
|
| 22 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 23 |
"step": 5
|
| 24 |
},
|
| 25 |
{
|
| 26 |
-
"completion_length":
|
| 27 |
"epoch": 0.075046904315197,
|
| 28 |
-
"grad_norm":
|
| 29 |
-
"kl":
|
| 30 |
"learning_rate": 1.4285714285714287e-05,
|
| 31 |
-
"loss": 0.
|
| 32 |
-
"reward": 0.
|
| 33 |
-
"reward_std": 0.
|
| 34 |
-
"rewards/accuracy_reward": 0.
|
| 35 |
-
"rewards/format_reward": 0.
|
| 36 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 37 |
"step": 10
|
| 38 |
},
|
| 39 |
{
|
| 40 |
-
"completion_length":
|
| 41 |
"epoch": 0.1125703564727955,
|
| 42 |
-
"grad_norm":
|
| 43 |
-
"kl":
|
| 44 |
"learning_rate": 1.9996515418688493e-05,
|
| 45 |
-
"loss": 0.
|
| 46 |
-
"reward":
|
| 47 |
-
"reward_std": 0.
|
| 48 |
-
"rewards/accuracy_reward": 0.
|
| 49 |
-
"rewards/format_reward": 0.
|
| 50 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 51 |
"step": 15
|
| 52 |
},
|
| 53 |
{
|
| 54 |
-
"completion_length":
|
| 55 |
"epoch": 0.150093808630394,
|
| 56 |
-
"grad_norm":
|
| 57 |
-
"kl":
|
| 58 |
"learning_rate": 1.9874809871741877e-05,
|
| 59 |
-
"loss": 0.
|
| 60 |
-
"reward":
|
| 61 |
-
"reward_std": 0.
|
| 62 |
-
"rewards/accuracy_reward": 0.
|
| 63 |
-
"rewards/format_reward": 0.
|
| 64 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 65 |
"step": 20
|
| 66 |
},
|
| 67 |
{
|
| 68 |
-
"completion_length":
|
| 69 |
"epoch": 0.18761726078799248,
|
| 70 |
-
"grad_norm":
|
| 71 |
-
"kl":
|
| 72 |
"learning_rate": 1.9581296124106682e-05,
|
| 73 |
-
"loss": 0.
|
| 74 |
-
"reward":
|
| 75 |
-
"reward_std": 0.
|
| 76 |
-
"rewards/accuracy_reward": 0.
|
| 77 |
-
"rewards/format_reward": 0.
|
| 78 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 79 |
"step": 25
|
| 80 |
},
|
| 81 |
{
|
| 82 |
-
"completion_length":
|
| 83 |
"epoch": 0.225140712945591,
|
| 84 |
-
"grad_norm":
|
| 85 |
-
"kl":
|
| 86 |
"learning_rate": 1.912108091398988e-05,
|
| 87 |
-
"loss": 0.
|
| 88 |
-
"reward": 0.
|
| 89 |
-
"reward_std": 0.
|
| 90 |
-
"rewards/accuracy_reward": 0.
|
| 91 |
-
"rewards/format_reward": 0.
|
| 92 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 93 |
"step": 30
|
| 94 |
},
|
| 95 |
{
|
| 96 |
-
"completion_length":
|
| 97 |
"epoch": 0.2626641651031895,
|
| 98 |
-
"grad_norm":
|
| 99 |
-
"kl":
|
| 100 |
"learning_rate": 1.8502171357296144e-05,
|
| 101 |
-
"loss": 0.
|
| 102 |
-
"reward": 0.
|
| 103 |
-
"reward_std": 0.
|
| 104 |
-
"rewards/accuracy_reward": 0.
|
| 105 |
-
"rewards/format_reward": 0.
|
| 106 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 107 |
"step": 35
|
| 108 |
},
|
| 109 |
{
|
| 110 |
-
"completion_length":
|
| 111 |
"epoch": 0.300187617260788,
|
| 112 |
-
"grad_norm":
|
| 113 |
-
"kl": 1.
|
| 114 |
"learning_rate": 1.773533563475053e-05,
|
| 115 |
-
"loss": 0.
|
| 116 |
-
"reward":
|
| 117 |
-
"reward_std": 0.
|
| 118 |
-
"rewards/accuracy_reward": 0.
|
| 119 |
-
"rewards/format_reward": 0.
|
| 120 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 121 |
"step": 40
|
| 122 |
},
|
| 123 |
{
|
| 124 |
-
"completion_length":
|
| 125 |
"epoch": 0.33771106941838647,
|
| 126 |
-
"grad_norm":
|
| 127 |
-
"kl": 1.
|
| 128 |
"learning_rate": 1.6833915640265485e-05,
|
| 129 |
-
"loss": 0.
|
| 130 |
-
"reward":
|
| 131 |
-
"reward_std": 0.
|
| 132 |
-
"rewards/accuracy_reward": 0.
|
| 133 |
-
"rewards/format_reward": 0.
|
| 134 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 135 |
"step": 45
|
| 136 |
},
|
| 137 |
{
|
| 138 |
-
"completion_length":
|
| 139 |
"epoch": 0.37523452157598497,
|
| 140 |
-
"grad_norm":
|
| 141 |
-
"kl":
|
| 142 |
"learning_rate": 1.58135948502146e-05,
|
| 143 |
-
"loss": 0.
|
| 144 |
-
"reward":
|
| 145 |
-
"reward_std": 0.
|
| 146 |
-
"rewards/accuracy_reward": 0.
|
| 147 |
-
"rewards/format_reward": 0.
|
| 148 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 149 |
"step": 50
|
| 150 |
},
|
| 151 |
{
|
| 152 |
-
"completion_length":
|
| 153 |
"epoch": 0.41275797373358347,
|
| 154 |
-
"grad_norm":
|
| 155 |
-
"kl":
|
| 156 |
"learning_rate": 1.4692125452370664e-05,
|
| 157 |
-
"loss": 0.
|
| 158 |
-
"reward":
|
| 159 |
-
"reward_std": 0.
|
| 160 |
-
"rewards/accuracy_reward": 0.
|
| 161 |
-
"rewards/format_reward": 0.
|
| 162 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 163 |
"step": 55
|
| 164 |
},
|
| 165 |
{
|
| 166 |
-
"completion_length":
|
| 167 |
"epoch": 0.450281425891182,
|
| 168 |
-
"grad_norm":
|
| 169 |
-
"kl":
|
| 170 |
"learning_rate": 1.348901948209167e-05,
|
| 171 |
-
"loss": 0.
|
| 172 |
-
"reward":
|
| 173 |
-
"reward_std": 0.
|
| 174 |
-
"rewards/accuracy_reward": 0.
|
| 175 |
-
"rewards/format_reward": 0.
|
| 176 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 177 |
"step": 60
|
| 178 |
},
|
| 179 |
{
|
| 180 |
-
"completion_length":
|
| 181 |
"epoch": 0.4878048780487805,
|
| 182 |
-
"grad_norm":
|
| 183 |
-
"kl":
|
| 184 |
"learning_rate": 1.2225209339563144e-05,
|
| 185 |
-
"loss": 0.
|
| 186 |
-
"reward":
|
| 187 |
-
"reward_std": 0.
|
| 188 |
-
"rewards/accuracy_reward": 0.
|
| 189 |
-
"rewards/format_reward": 0.
|
| 190 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 191 |
"step": 65
|
| 192 |
},
|
| 193 |
{
|
| 194 |
-
"completion_length":
|
| 195 |
"epoch": 0.525328330206379,
|
| 196 |
-
"grad_norm":
|
| 197 |
-
"kl":
|
| 198 |
"learning_rate": 1.092268359463302e-05,
|
| 199 |
-
"loss": 0.
|
| 200 |
-
"reward":
|
| 201 |
-
"reward_std": 0.
|
| 202 |
-
"rewards/accuracy_reward": 0.
|
| 203 |
-
"rewards/format_reward": 0.
|
| 204 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 205 |
"step": 70
|
| 206 |
},
|
| 207 |
{
|
| 208 |
-
"completion_length":
|
| 209 |
"epoch": 0.5628517823639775,
|
| 210 |
-
"grad_norm":
|
| 211 |
-
"kl":
|
| 212 |
"learning_rate": 9.604104415737309e-06,
|
| 213 |
-
"loss": 0.
|
| 214 |
-
"reward":
|
| 215 |
-
"reward_std": 0.
|
| 216 |
-
"rewards/accuracy_reward": 0.
|
| 217 |
-
"rewards/format_reward": 0.
|
| 218 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 219 |
"step": 75
|
| 220 |
},
|
| 221 |
{
|
| 222 |
-
"completion_length":
|
| 223 |
"epoch": 0.600375234521576,
|
| 224 |
-
"grad_norm":
|
| 225 |
-
"kl":
|
| 226 |
"learning_rate": 8.292413279130625e-06,
|
| 227 |
-
"loss": 0.
|
| 228 |
-
"reward":
|
| 229 |
-
"reward_std": 0.
|
| 230 |
-
"rewards/accuracy_reward": 0.
|
| 231 |
-
"rewards/format_reward": 0.
|
| 232 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 233 |
"step": 80
|
| 234 |
},
|
| 235 |
{
|
| 236 |
-
"completion_length":
|
| 237 |
"epoch": 0.6378986866791745,
|
| 238 |
-
"grad_norm":
|
| 239 |
-
"kl":
|
| 240 |
"learning_rate": 7.010431818542298e-06,
|
| 241 |
-
"loss": 0.
|
| 242 |
-
"reward":
|
| 243 |
-
"reward_std": 0.
|
| 244 |
-
"rewards/accuracy_reward": 0.
|
| 245 |
-
"rewards/format_reward": 0.
|
| 246 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 247 |
"step": 85
|
| 248 |
},
|
| 249 |
{
|
| 250 |
-
"completion_length":
|
| 251 |
"epoch": 0.6754221388367729,
|
| 252 |
-
"grad_norm":
|
| 253 |
-
"kl":
|
| 254 |
"learning_rate": 5.780464759928623e-06,
|
| 255 |
-
"loss": 0.
|
| 256 |
-
"reward":
|
| 257 |
-
"reward_std": 0.
|
| 258 |
-
"rewards/accuracy_reward": 0.
|
| 259 |
-
"rewards/format_reward": 0.
|
| 260 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 261 |
"step": 90
|
| 262 |
},
|
| 263 |
{
|
| 264 |
-
"completion_length":
|
| 265 |
"epoch": 0.7129455909943715,
|
| 266 |
-
"grad_norm":
|
| 267 |
-
"kl":
|
| 268 |
"learning_rate": 4.623911849714226e-06,
|
| 269 |
-
"loss": 0.
|
| 270 |
-
"reward":
|
| 271 |
-
"reward_std": 0.
|
| 272 |
-
"rewards/accuracy_reward": 0.
|
| 273 |
-
"rewards/format_reward": 0.
|
| 274 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 275 |
"step": 95
|
| 276 |
},
|
| 277 |
{
|
| 278 |
-
"completion_length":
|
| 279 |
"epoch": 0.7504690431519699,
|
| 280 |
-
"grad_norm":
|
| 281 |
-
"kl":
|
| 282 |
"learning_rate": 3.560895528440844e-06,
|
| 283 |
-
"loss": 0.
|
| 284 |
-
"reward":
|
| 285 |
-
"reward_std": 0.
|
| 286 |
-
"rewards/accuracy_reward": 0.
|
| 287 |
-
"rewards/format_reward": 0.
|
| 288 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 289 |
"step": 100
|
| 290 |
},
|
| 291 |
{
|
| 292 |
"epoch": 0.7504690431519699,
|
| 293 |
-
"eval_completion_length":
|
| 294 |
-
"eval_kl":
|
| 295 |
-
"eval_loss": 0.
|
| 296 |
-
"eval_reward":
|
| 297 |
-
"eval_reward_std": 0.
|
| 298 |
-
"eval_rewards/accuracy_reward": 0.
|
| 299 |
-
"eval_rewards/format_reward": 0.
|
| 300 |
-
"eval_rewards/relaxed_accuracy_reward": 0.
|
| 301 |
-
"eval_runtime":
|
| 302 |
-
"eval_samples_per_second":
|
| 303 |
-
"eval_steps_per_second": 0.
|
| 304 |
"step": 100
|
| 305 |
},
|
| 306 |
{
|
| 307 |
-
"completion_length":
|
| 308 |
"epoch": 0.7879924953095685,
|
| 309 |
-
"grad_norm":
|
| 310 |
-
"kl":
|
| 311 |
"learning_rate": 2.6099108277934105e-06,
|
| 312 |
-
"loss": 0.
|
| 313 |
-
"reward":
|
| 314 |
-
"reward_std": 0.
|
| 315 |
-
"rewards/accuracy_reward": 0.
|
| 316 |
-
"rewards/format_reward": 0.
|
| 317 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 318 |
"step": 105
|
| 319 |
},
|
| 320 |
{
|
| 321 |
-
"completion_length":
|
| 322 |
"epoch": 0.8255159474671669,
|
| 323 |
-
"grad_norm":
|
| 324 |
-
"kl":
|
| 325 |
"learning_rate": 1.7875035823168641e-06,
|
| 326 |
-
"loss": 0.
|
| 327 |
-
"reward":
|
| 328 |
-
"reward_std": 0.
|
| 329 |
-
"rewards/accuracy_reward": 0.
|
| 330 |
-
"rewards/format_reward": 0.
|
| 331 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 332 |
"step": 110
|
| 333 |
},
|
| 334 |
{
|
| 335 |
-
"completion_length":
|
| 336 |
"epoch": 0.8630393996247655,
|
| 337 |
-
"grad_norm":
|
| 338 |
-
"kl":
|
| 339 |
"learning_rate": 1.1079825545001887e-06,
|
| 340 |
-
"loss": 0.
|
| 341 |
-
"reward":
|
| 342 |
-
"reward_std": 0.
|
| 343 |
-
"rewards/accuracy_reward": 0.
|
| 344 |
-
"rewards/format_reward": 0.
|
| 345 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 346 |
"step": 115
|
| 347 |
},
|
| 348 |
{
|
| 349 |
-
"completion_length":
|
| 350 |
"epoch": 0.900562851782364,
|
| 351 |
-
"grad_norm":
|
| 352 |
-
"kl":
|
| 353 |
"learning_rate": 5.831704818578842e-07,
|
| 354 |
-
"loss": 0.
|
| 355 |
-
"reward":
|
| 356 |
-
"reward_std": 0.
|
| 357 |
-
"rewards/accuracy_reward": 0.
|
| 358 |
-
"rewards/format_reward": 0.
|
| 359 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 360 |
"step": 120
|
| 361 |
},
|
| 362 |
{
|
| 363 |
-
"completion_length":
|
| 364 |
"epoch": 0.9380863039399625,
|
| 365 |
-
"grad_norm":
|
| 366 |
-
"kl":
|
| 367 |
"learning_rate": 2.2219837744959284e-07,
|
| 368 |
-
"loss": 0.
|
| 369 |
-
"reward":
|
| 370 |
-
"reward_std": 0.
|
| 371 |
-
"rewards/accuracy_reward": 0.
|
| 372 |
-
"rewards/format_reward": 0.
|
| 373 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 374 |
"step": 125
|
| 375 |
},
|
| 376 |
{
|
| 377 |
-
"completion_length":
|
| 378 |
"epoch": 0.975609756097561,
|
| 379 |
-
"grad_norm":
|
| 380 |
-
"kl":
|
| 381 |
"learning_rate": 3.134666272774034e-08,
|
| 382 |
-
"loss": 0.
|
| 383 |
-
"reward":
|
| 384 |
-
"reward_std": 0.
|
| 385 |
-
"rewards/accuracy_reward": 0.
|
| 386 |
-
"rewards/format_reward": 0.
|
| 387 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 388 |
"step": 130
|
| 389 |
},
|
| 390 |
{
|
| 391 |
-
"completion_length":
|
| 392 |
"epoch": 0.99812382739212,
|
| 393 |
-
"kl":
|
| 394 |
-
"reward":
|
| 395 |
-
"reward_std": 0.
|
| 396 |
-
"rewards/accuracy_reward": 0.
|
| 397 |
-
"rewards/format_reward": 0.
|
| 398 |
-
"rewards/relaxed_accuracy_reward": 0.
|
| 399 |
"step": 133,
|
| 400 |
"total_flos": 0.0,
|
| 401 |
-
"train_loss": 0.
|
| 402 |
-
"train_runtime":
|
| 403 |
-
"train_samples_per_second": 0.
|
| 404 |
"train_steps_per_second": 0.001
|
| 405 |
}
|
| 406 |
],
|
|
|
|
| 9 |
"is_world_process_zero": true,
|
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
+
"completion_length": 90.7151068329811,
|
| 13 |
"epoch": 0.0375234521575985,
|
| 14 |
+
"grad_norm": 2.802288055419922,
|
| 15 |
+
"kl": 0.01843569278717041,
|
| 16 |
"learning_rate": 7.1428571428571436e-06,
|
| 17 |
+
"loss": 0.0016,
|
| 18 |
+
"reward": 0.061718751792795956,
|
| 19 |
+
"reward_std": 0.13259942815639078,
|
| 20 |
+
"rewards/accuracy_reward": 0.016406250395812096,
|
| 21 |
+
"rewards/format_reward": 0.026302084024064244,
|
| 22 |
+
"rewards/relaxed_accuracy_reward": 0.019010417140088977,
|
| 23 |
"step": 5
|
| 24 |
},
|
| 25 |
{
|
| 26 |
+
"completion_length": 52.738803672790525,
|
| 27 |
"epoch": 0.075046904315197,
|
| 28 |
+
"grad_norm": 4.877239227294922,
|
| 29 |
+
"kl": 0.3423828125,
|
| 30 |
"learning_rate": 1.4285714285714287e-05,
|
| 31 |
+
"loss": 0.0019,
|
| 32 |
+
"reward": 0.9867187785916031,
|
| 33 |
+
"reward_std": 0.4639298491179943,
|
| 34 |
+
"rewards/accuracy_reward": 0.13906250363215805,
|
| 35 |
+
"rewards/format_reward": 0.6864583493210376,
|
| 36 |
+
"rewards/relaxed_accuracy_reward": 0.16119792128447444,
|
| 37 |
"step": 10
|
| 38 |
},
|
| 39 |
{
|
| 40 |
+
"completion_length": 146.94818139076233,
|
| 41 |
"epoch": 0.1125703564727955,
|
| 42 |
+
"grad_norm": 2.6658596992492676,
|
| 43 |
+
"kl": 0.64903564453125,
|
| 44 |
"learning_rate": 1.9996515418688493e-05,
|
| 45 |
+
"loss": 0.0405,
|
| 46 |
+
"reward": 1.1822917029261588,
|
| 47 |
+
"reward_std": 0.4913041713181883,
|
| 48 |
+
"rewards/accuracy_reward": 0.14843750416766852,
|
| 49 |
+
"rewards/format_reward": 0.8632812697440386,
|
| 50 |
+
"rewards/relaxed_accuracy_reward": 0.17057292158715426,
|
| 51 |
"step": 15
|
| 52 |
},
|
| 53 |
{
|
| 54 |
+
"completion_length": 47.01458476781845,
|
| 55 |
"epoch": 0.150093808630394,
|
| 56 |
+
"grad_norm": 1.1637159585952759,
|
| 57 |
+
"kl": 0.74886474609375,
|
| 58 |
"learning_rate": 1.9874809871741877e-05,
|
| 59 |
+
"loss": 0.025,
|
| 60 |
+
"reward": 1.0914062693715096,
|
| 61 |
+
"reward_std": 0.19755753134377302,
|
| 62 |
+
"rewards/accuracy_reward": 0.05182291832752526,
|
| 63 |
+
"rewards/format_reward": 0.9742187555879355,
|
| 64 |
+
"rewards/relaxed_accuracy_reward": 0.06536458558402955,
|
| 65 |
"step": 20
|
| 66 |
},
|
| 67 |
{
|
| 68 |
+
"completion_length": 113.31823229789734,
|
| 69 |
"epoch": 0.18761726078799248,
|
| 70 |
+
"grad_norm": 6.727182388305664,
|
| 71 |
+
"kl": 0.301470947265625,
|
| 72 |
"learning_rate": 1.9581296124106682e-05,
|
| 73 |
+
"loss": 0.0255,
|
| 74 |
+
"reward": 1.4166667103767394,
|
| 75 |
+
"reward_std": 0.39920547502115367,
|
| 76 |
+
"rewards/accuracy_reward": 0.18359375651925802,
|
| 77 |
+
"rewards/format_reward": 0.9898437581956386,
|
| 78 |
+
"rewards/relaxed_accuracy_reward": 0.24322917410172523,
|
| 79 |
"step": 25
|
| 80 |
},
|
| 81 |
{
|
| 82 |
+
"completion_length": 65.79661650806665,
|
| 83 |
"epoch": 0.225140712945591,
|
| 84 |
+
"grad_norm": 3.8793492317199707,
|
| 85 |
+
"kl": 12.40894775390625,
|
| 86 |
"learning_rate": 1.912108091398988e-05,
|
| 87 |
+
"loss": 0.4861,
|
| 88 |
+
"reward": 0.8789062738418579,
|
| 89 |
+
"reward_std": 0.249878820637241,
|
| 90 |
+
"rewards/accuracy_reward": 0.12656250381842254,
|
| 91 |
+
"rewards/format_reward": 0.5846354231238365,
|
| 92 |
+
"rewards/relaxed_accuracy_reward": 0.16770833763293921,
|
| 93 |
"step": 30
|
| 94 |
},
|
| 95 |
{
|
| 96 |
+
"completion_length": 65.01015776395798,
|
| 97 |
"epoch": 0.2626641651031895,
|
| 98 |
+
"grad_norm": 127.46144104003906,
|
| 99 |
+
"kl": 0.706707763671875,
|
| 100 |
"learning_rate": 1.8502171357296144e-05,
|
| 101 |
+
"loss": 0.0008,
|
| 102 |
+
"reward": 0.4466145946178585,
|
| 103 |
+
"reward_std": 0.16547852829098703,
|
| 104 |
+
"rewards/accuracy_reward": 0.02786458395421505,
|
| 105 |
+
"rewards/format_reward": 0.3843750098953024,
|
| 106 |
+
"rewards/relaxed_accuracy_reward": 0.034375000698491934,
|
| 107 |
"step": 35
|
| 108 |
},
|
| 109 |
{
|
| 110 |
+
"completion_length": 44.66328253149986,
|
| 111 |
"epoch": 0.300187617260788,
|
| 112 |
+
"grad_norm": 1.7230250835418701,
|
| 113 |
+
"kl": 1.5087890625,
|
| 114 |
"learning_rate": 1.773533563475053e-05,
|
| 115 |
+
"loss": 0.0685,
|
| 116 |
+
"reward": 1.279427120834589,
|
| 117 |
+
"reward_std": 0.3529853185173124,
|
| 118 |
+
"rewards/accuracy_reward": 0.13802083663176745,
|
| 119 |
+
"rewards/format_reward": 0.9729166731238366,
|
| 120 |
+
"rewards/relaxed_accuracy_reward": 0.16848958830814809,
|
| 121 |
"step": 40
|
| 122 |
},
|
| 123 |
{
|
| 124 |
+
"completion_length": 18.572396397590637,
|
| 125 |
"epoch": 0.33771106941838647,
|
| 126 |
+
"grad_norm": 197.8912811279297,
|
| 127 |
+
"kl": 1.648193359375,
|
| 128 |
"learning_rate": 1.6833915640265485e-05,
|
| 129 |
+
"loss": 0.0695,
|
| 130 |
+
"reward": 1.4171875409781933,
|
| 131 |
+
"reward_std": 0.25054811174049973,
|
| 132 |
+
"rewards/accuracy_reward": 0.18385417158715428,
|
| 133 |
+
"rewards/format_reward": 0.9942708384245634,
|
| 134 |
+
"rewards/relaxed_accuracy_reward": 0.23906250612344593,
|
| 135 |
"step": 45
|
| 136 |
},
|
| 137 |
{
|
| 138 |
+
"completion_length": 36.354167491197586,
|
| 139 |
"epoch": 0.37523452157598497,
|
| 140 |
+
"grad_norm": 2.2742254734039307,
|
| 141 |
+
"kl": 4.847802734375,
|
| 142 |
"learning_rate": 1.58135948502146e-05,
|
| 143 |
+
"loss": 0.2069,
|
| 144 |
+
"reward": 1.3265625357627868,
|
| 145 |
+
"reward_std": 0.322022933838889,
|
| 146 |
+
"rewards/accuracy_reward": 0.1640625043073669,
|
| 147 |
+
"rewards/format_reward": 0.9533854331821203,
|
| 148 |
+
"rewards/relaxed_accuracy_reward": 0.2091145885642618,
|
| 149 |
"step": 50
|
| 150 |
},
|
| 151 |
{
|
| 152 |
+
"completion_length": 32.32109480500221,
|
| 153 |
"epoch": 0.41275797373358347,
|
| 154 |
+
"grad_norm": 1.68628990650177,
|
| 155 |
+
"kl": 2.0571533203125,
|
| 156 |
"learning_rate": 1.4692125452370664e-05,
|
| 157 |
+
"loss": 0.07,
|
| 158 |
+
"reward": 1.4083333723247051,
|
| 159 |
+
"reward_std": 0.32693559252656995,
|
| 160 |
+
"rewards/accuracy_reward": 0.18463542151730508,
|
| 161 |
+
"rewards/format_reward": 0.9885416757315397,
|
| 162 |
+
"rewards/relaxed_accuracy_reward": 0.23515625537838786,
|
| 163 |
"step": 55
|
| 164 |
},
|
| 165 |
{
|
| 166 |
+
"completion_length": 108.25755517482757,
|
| 167 |
"epoch": 0.450281425891182,
|
| 168 |
+
"grad_norm": 1.052840232849121,
|
| 169 |
+
"kl": 0.5584716796875,
|
| 170 |
"learning_rate": 1.348901948209167e-05,
|
| 171 |
+
"loss": 0.0306,
|
| 172 |
+
"reward": 1.3903646256774664,
|
| 173 |
+
"reward_std": 0.40279707135632636,
|
| 174 |
+
"rewards/accuracy_reward": 0.185156254703179,
|
| 175 |
+
"rewards/format_reward": 0.9695312656462193,
|
| 176 |
+
"rewards/relaxed_accuracy_reward": 0.23567708909977228,
|
| 177 |
"step": 60
|
| 178 |
},
|
| 179 |
{
|
| 180 |
+
"completion_length": 56.58411636352539,
|
| 181 |
"epoch": 0.4878048780487805,
|
| 182 |
+
"grad_norm": 1.2224760055541992,
|
| 183 |
+
"kl": 0.54796142578125,
|
| 184 |
"learning_rate": 1.2225209339563144e-05,
|
| 185 |
+
"loss": 0.0342,
|
| 186 |
+
"reward": 1.4695312902331352,
|
| 187 |
+
"reward_std": 0.36167940208688376,
|
| 188 |
+
"rewards/accuracy_reward": 0.21458333889022468,
|
| 189 |
+
"rewards/format_reward": 0.9908854246139527,
|
| 190 |
+
"rewards/relaxed_accuracy_reward": 0.2640625067986548,
|
| 191 |
"step": 65
|
| 192 |
},
|
| 193 |
{
|
| 194 |
+
"completion_length": 61.634637117385864,
|
| 195 |
"epoch": 0.525328330206379,
|
| 196 |
+
"grad_norm": 3.6565780639648438,
|
| 197 |
+
"kl": 0.692724609375,
|
| 198 |
"learning_rate": 1.092268359463302e-05,
|
| 199 |
+
"loss": 0.0236,
|
| 200 |
+
"reward": 1.4825521290302277,
|
| 201 |
+
"reward_std": 0.305070091644302,
|
| 202 |
+
"rewards/accuracy_reward": 0.2138020895421505,
|
| 203 |
+
"rewards/format_reward": 0.9934895880520344,
|
| 204 |
+
"rewards/relaxed_accuracy_reward": 0.2752604250796139,
|
| 205 |
"step": 70
|
| 206 |
},
|
| 207 |
{
|
| 208 |
+
"completion_length": 84.00182542800903,
|
| 209 |
"epoch": 0.5628517823639775,
|
| 210 |
+
"grad_norm": 2.023277997970581,
|
| 211 |
+
"kl": 0.6501220703125,
|
| 212 |
"learning_rate": 9.604104415737309e-06,
|
| 213 |
+
"loss": 0.0362,
|
| 214 |
+
"reward": 1.452604204416275,
|
| 215 |
+
"reward_std": 0.35850795153528453,
|
| 216 |
+
"rewards/accuracy_reward": 0.21250000558793544,
|
| 217 |
+
"rewards/format_reward": 0.9677083436399698,
|
| 218 |
+
"rewards/relaxed_accuracy_reward": 0.2723958401707932,
|
| 219 |
"step": 75
|
| 220 |
},
|
| 221 |
{
|
| 222 |
+
"completion_length": 84.90573143959045,
|
| 223 |
"epoch": 0.600375234521576,
|
| 224 |
+
"grad_norm": 4.130603313446045,
|
| 225 |
+
"kl": 0.583837890625,
|
| 226 |
"learning_rate": 8.292413279130625e-06,
|
| 227 |
+
"loss": 0.0256,
|
| 228 |
+
"reward": 1.0875000283122063,
|
| 229 |
+
"reward_std": 0.6249250227585434,
|
| 230 |
+
"rewards/accuracy_reward": 0.1419270873069763,
|
| 231 |
+
"rewards/format_reward": 0.7656250182539225,
|
| 232 |
+
"rewards/relaxed_accuracy_reward": 0.1799479213077575,
|
| 233 |
"step": 80
|
| 234 |
},
|
| 235 |
{
|
| 236 |
+
"completion_length": 64.10599145889282,
|
| 237 |
"epoch": 0.6378986866791745,
|
| 238 |
+
"grad_norm": 1.4160608053207397,
|
| 239 |
+
"kl": 0.508251953125,
|
| 240 |
"learning_rate": 7.010431818542298e-06,
|
| 241 |
+
"loss": 0.0107,
|
| 242 |
+
"reward": 1.1182292014360429,
|
| 243 |
+
"reward_std": 0.46313118319958446,
|
| 244 |
+
"rewards/accuracy_reward": 0.10078125314321369,
|
| 245 |
+
"rewards/format_reward": 0.8817708536982536,
|
| 246 |
+
"rewards/relaxed_accuracy_reward": 0.13567708695773034,
|
| 247 |
"step": 85
|
| 248 |
},
|
| 249 |
{
|
| 250 |
+
"completion_length": 64.69922063350677,
|
| 251 |
"epoch": 0.6754221388367729,
|
| 252 |
+
"grad_norm": 1.4829213619232178,
|
| 253 |
+
"kl": 0.46568603515625,
|
| 254 |
"learning_rate": 5.780464759928623e-06,
|
| 255 |
+
"loss": 0.0158,
|
| 256 |
+
"reward": 1.262760452926159,
|
| 257 |
+
"reward_std": 0.3677686099894345,
|
| 258 |
+
"rewards/accuracy_reward": 0.12369792014360428,
|
| 259 |
+
"rewards/format_reward": 0.9700520999729634,
|
| 260 |
+
"rewards/relaxed_accuracy_reward": 0.16901042116805912,
|
| 261 |
"step": 90
|
| 262 |
},
|
| 263 |
{
|
| 264 |
+
"completion_length": 77.71198177337646,
|
| 265 |
"epoch": 0.7129455909943715,
|
| 266 |
+
"grad_norm": 1.331533670425415,
|
| 267 |
+
"kl": 0.4433837890625,
|
| 268 |
"learning_rate": 4.623911849714226e-06,
|
| 269 |
+
"loss": 0.023,
|
| 270 |
+
"reward": 1.3916667073965072,
|
| 271 |
+
"reward_std": 0.3710032233502716,
|
| 272 |
+
"rewards/accuracy_reward": 0.17578125505242498,
|
| 273 |
+
"rewards/format_reward": 0.9848958436399698,
|
| 274 |
+
"rewards/relaxed_accuracy_reward": 0.2309895897982642,
|
| 275 |
"step": 95
|
| 276 |
},
|
| 277 |
{
|
| 278 |
+
"completion_length": 97.82890903949738,
|
| 279 |
"epoch": 0.7504690431519699,
|
| 280 |
+
"grad_norm": 1.2931978702545166,
|
| 281 |
+
"kl": 0.49661865234375,
|
| 282 |
"learning_rate": 3.560895528440844e-06,
|
| 283 |
+
"loss": 0.0402,
|
| 284 |
+
"reward": 1.4742187947034835,
|
| 285 |
+
"reward_std": 0.37555707613937556,
|
| 286 |
+
"rewards/accuracy_reward": 0.21562500512227417,
|
| 287 |
+
"rewards/format_reward": 0.9763020973652601,
|
| 288 |
+
"rewards/relaxed_accuracy_reward": 0.282291673310101,
|
| 289 |
"step": 100
|
| 290 |
},
|
| 291 |
{
|
| 292 |
"epoch": 0.7504690431519699,
|
| 293 |
+
"eval_completion_length": 98.48214570155127,
|
| 294 |
+
"eval_kl": 0.5256492269163763,
|
| 295 |
+
"eval_loss": 0.028766795992851257,
|
| 296 |
+
"eval_reward": 1.4547038757427229,
|
| 297 |
+
"eval_reward_std": 0.4197673304882614,
|
| 298 |
+
"eval_rewards/accuracy_reward": 0.21239838048244603,
|
| 299 |
+
"eval_rewards/format_reward": 0.9732868901943911,
|
| 300 |
+
"eval_rewards/relaxed_accuracy_reward": 0.26901859094353087,
|
| 301 |
+
"eval_runtime": 3804.2499,
|
| 302 |
+
"eval_samples_per_second": 0.301,
|
| 303 |
+
"eval_steps_per_second": 0.075,
|
| 304 |
"step": 100
|
| 305 |
},
|
| 306 |
{
|
| 307 |
+
"completion_length": 99.60547137260437,
|
| 308 |
"epoch": 0.7879924953095685,
|
| 309 |
+
"grad_norm": 1.4268443584442139,
|
| 310 |
+
"kl": 0.5451171875,
|
| 311 |
"learning_rate": 2.6099108277934105e-06,
|
| 312 |
+
"loss": 0.0419,
|
| 313 |
+
"reward": 1.4901042107492686,
|
| 314 |
+
"reward_std": 0.4182614594232291,
|
| 315 |
+
"rewards/accuracy_reward": 0.23645834045019,
|
| 316 |
+
"rewards/format_reward": 0.9674479342997074,
|
| 317 |
+
"rewards/relaxed_accuracy_reward": 0.2861979248933494,
|
| 318 |
"step": 105
|
| 319 |
},
|
| 320 |
{
|
| 321 |
+
"completion_length": 96.49583611488342,
|
| 322 |
"epoch": 0.8255159474671669,
|
| 323 |
+
"grad_norm": 1.0921823978424072,
|
| 324 |
+
"kl": 0.50526123046875,
|
| 325 |
"learning_rate": 1.7875035823168641e-06,
|
| 326 |
+
"loss": 0.0214,
|
| 327 |
+
"reward": 1.4427083771675826,
|
| 328 |
+
"reward_std": 0.3818355408497155,
|
| 329 |
+
"rewards/accuracy_reward": 0.2046875060768798,
|
| 330 |
+
"rewards/format_reward": 0.9671875163912773,
|
| 331 |
+
"rewards/relaxed_accuracy_reward": 0.27083334026392547,
|
| 332 |
"step": 110
|
| 333 |
},
|
| 334 |
{
|
| 335 |
+
"completion_length": 90.97396085262298,
|
| 336 |
"epoch": 0.8630393996247655,
|
| 337 |
+
"grad_norm": 1.486311674118042,
|
| 338 |
+
"kl": 0.50943603515625,
|
| 339 |
"learning_rate": 1.1079825545001887e-06,
|
| 340 |
+
"loss": 0.0204,
|
| 341 |
+
"reward": 1.4427083786576986,
|
| 342 |
+
"reward_std": 0.37329327603802087,
|
| 343 |
+
"rewards/accuracy_reward": 0.20286458970513194,
|
| 344 |
+
"rewards/format_reward": 0.9742187630385161,
|
| 345 |
+
"rewards/relaxed_accuracy_reward": 0.26562500847503545,
|
| 346 |
"step": 115
|
| 347 |
},
|
| 348 |
{
|
| 349 |
+
"completion_length": 86.1117213010788,
|
| 350 |
"epoch": 0.900562851782364,
|
| 351 |
+
"grad_norm": 2.1096079349517822,
|
| 352 |
+
"kl": 0.4741455078125,
|
| 353 |
"learning_rate": 5.831704818578842e-07,
|
| 354 |
+
"loss": 0.0203,
|
| 355 |
+
"reward": 1.4510417070239783,
|
| 356 |
+
"reward_std": 0.3531476927921176,
|
| 357 |
+
"rewards/accuracy_reward": 0.21015625533182175,
|
| 358 |
+
"rewards/format_reward": 0.9757812641561031,
|
| 359 |
+
"rewards/relaxed_accuracy_reward": 0.2651041731471196,
|
| 360 |
"step": 120
|
| 361 |
},
|
| 362 |
{
|
| 363 |
+
"completion_length": 80.04765882492066,
|
| 364 |
"epoch": 0.9380863039399625,
|
| 365 |
+
"grad_norm": 1.8568741083145142,
|
| 366 |
+
"kl": 0.51519775390625,
|
| 367 |
"learning_rate": 2.2219837744959284e-07,
|
| 368 |
+
"loss": 0.0225,
|
| 369 |
+
"reward": 1.507031300663948,
|
| 370 |
+
"reward_std": 0.4000093450304121,
|
| 371 |
+
"rewards/accuracy_reward": 0.2270833398681134,
|
| 372 |
+
"rewards/format_reward": 0.9763020988553762,
|
| 373 |
+
"rewards/relaxed_accuracy_reward": 0.3036458430346102,
|
| 374 |
"step": 125
|
| 375 |
},
|
| 376 |
{
|
| 377 |
+
"completion_length": 80.31172132492065,
|
| 378 |
"epoch": 0.975609756097561,
|
| 379 |
+
"grad_norm": 7.431880474090576,
|
| 380 |
+
"kl": 0.50054931640625,
|
| 381 |
"learning_rate": 3.134666272774034e-08,
|
| 382 |
+
"loss": 0.013,
|
| 383 |
+
"reward": 1.4830729607492685,
|
| 384 |
+
"reward_std": 0.3494715398177505,
|
| 385 |
+
"rewards/accuracy_reward": 0.2234375062864274,
|
| 386 |
+
"rewards/format_reward": 0.9770833492279053,
|
| 387 |
+
"rewards/relaxed_accuracy_reward": 0.2825520922895521,
|
| 388 |
"step": 130
|
| 389 |
},
|
| 390 |
{
|
| 391 |
+
"completion_length": 80.43663430213928,
|
| 392 |
"epoch": 0.99812382739212,
|
| 393 |
+
"kl": 0.5088704427083334,
|
| 394 |
+
"reward": 1.4366319850087166,
|
| 395 |
+
"reward_std": 0.36855422138857347,
|
| 396 |
+
"rewards/accuracy_reward": 0.194444449346823,
|
| 397 |
+
"rewards/format_reward": 0.9687500180055698,
|
| 398 |
+
"rewards/relaxed_accuracy_reward": 0.2734375084983185,
|
| 399 |
"step": 133,
|
| 400 |
"total_flos": 0.0,
|
| 401 |
+
"train_loss": 0.051685971918662914,
|
| 402 |
+
"train_runtime": 185838.8689,
|
| 403 |
+
"train_samples_per_second": 0.092,
|
| 404 |
"train_steps_per_second": 0.001
|
| 405 |
}
|
| 406 |
],
|