diff --git a/.gitattributes b/.gitattributes index 5186aa522450731f62d31d11f6a05d74f1101777..20bffb257279288a167b5be5fd007db8b6abb549 100644 --- a/.gitattributes +++ b/.gitattributes @@ -45,3 +45,11 @@ checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-450/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text checkpoint-550/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-650/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-750/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-850/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-950/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..396b39a496c12fa146517979ebea4250098a2aac --- /dev/null +++ b/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f578679590f00f526abf0cd359997842706b7244b666ba9648de49ada5b7ad28 +size 778096664 diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ac612ae768d3619962810d0b9024cdfdf0d2648 --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4736090e3f0672cd44514479c56a1835871d8df4de0111316ea4425fb65b7b1 +size 395571252 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..64b4e674c00a10f9d5aeaa272de1d5f2daed39fc --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0362c4a8c12885b7f0d72e807ce5ab6659ebd24423ccb7f3fbea37ffee2247b9 +size 14244 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aba67a39c50acdf894d631951594cea9e388847 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147cdcccd744ef24e23a5fcac78596052a0683a52452407a2fad2e8f917925d9 +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-600/tokenizer.json b/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02c9cecdba775feca14292f192dc1b4c54c28d9a --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,9033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.097902097902098, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-650/README.md b/checkpoint-650/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-650/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-650/adapter_config.json b/checkpoint-650/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-650/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-650/adapter_model.safetensors b/checkpoint-650/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9f170d692c8586837826ad2aad0ef166ce5464b --- /dev/null +++ b/checkpoint-650/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d792de2e216c2e689f9ac3cf7a0a17647a95254692790cffefb56f504cd4be31 +size 778096664 diff --git a/checkpoint-650/optimizer.pt b/checkpoint-650/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..426c28dabcdc3562a4992d3a35e8db05e10e399b --- /dev/null +++ b/checkpoint-650/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad157cad64728b337d1833cf5521b778a722dc4c798e3491fede5727518cac7a +size 395571252 diff --git a/checkpoint-650/rng_state.pth b/checkpoint-650/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed996ac1a2391c4d0c27df69b6185c04e2054888 --- /dev/null +++ b/checkpoint-650/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2d540e31f043f043ecd90ce094efe51e072577fa623ea27f808cc4e8d1a0d8 +size 14244 diff --git a/checkpoint-650/scheduler.pt b/checkpoint-650/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..743a9ff31f051a31f97c03004592da6e81fdfbbb --- /dev/null +++ b/checkpoint-650/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a95c3a91211562cf55e70b2d855b42ecc024762e653c37627ded6a594d142fd +size 1064 diff --git a/checkpoint-650/special_tokens_map.json b/checkpoint-650/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-650/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-650/tokenizer.json b/checkpoint-650/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-650/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-650/tokenizer_config.json b/checkpoint-650/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-650/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-650/trainer_state.json b/checkpoint-650/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e6972fdc2988e246315de0d507eee8b9556c0c68 --- /dev/null +++ b/checkpoint-650/trainer_state.json @@ -0,0 +1,9783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2727272727272725, + "eval_steps": 500, + "global_step": 650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-650/training_args.bin b/checkpoint-650/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-650/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-700/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae8345a06ef4c5caaf25a8bd106b92137ba0f3ea --- /dev/null +++ b/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0ef2082077855e0ee049d3fa54e5daa990b517bde22017c1a0ebd48f339e3b +size 778096664 diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..679ea005ef3a27e355e1bc137a72acdea2db8293 --- /dev/null +++ b/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c001430a46ee2b18da8e7bf201686cd70b26c7578e04fc76c7b9f207bc2ba74 +size 395571252 diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..76bf14880381e410a96b0fa9963e8aa7edac566d --- /dev/null +++ b/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b27724737e21503cee22e539eb2d0e8faab31ab505b66afa81d2081fd0d14324 +size 14244 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9adc012880449f66897cb3a462e30d26cb84953 --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa0df54c5ffe33bfe3ac0ac0ed94e995d5b009fa6488b50b2b3489daa9f5c6c +size 1064 diff --git a/checkpoint-700/special_tokens_map.json b/checkpoint-700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-700/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-700/tokenizer.json b/checkpoint-700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-700/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e43568bf150919b10b0f9a450c608ba14719da6d --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,10533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.4475524475524475, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-750/README.md b/checkpoint-750/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-750/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-750/adapter_config.json b/checkpoint-750/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-750/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-750/adapter_model.safetensors b/checkpoint-750/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cae3fa24dfba4ae817cc81748ed9a62bd8b6f632 --- /dev/null +++ b/checkpoint-750/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce25de75d445d5d2d3998a12c84b9dcf5e07e14b906759e65c600d7bf10def39 +size 778096664 diff --git a/checkpoint-750/optimizer.pt b/checkpoint-750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bdec03d56e3b721bf2e523f8490926bd2c20af1 --- /dev/null +++ b/checkpoint-750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1149769bb058aa6c3f8cd5ecb05049e0f618e80a6e998ef8e37e5e342aba17e5 +size 395571252 diff --git a/checkpoint-750/rng_state.pth b/checkpoint-750/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e74e7074b0a084be6f2df5b814c6112d9dfa4e7 --- /dev/null +++ b/checkpoint-750/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fc5d747fdcfa7c414c72a2922f53239839da5dc6fbd68aa02aafc7c492fe984 +size 14244 diff --git a/checkpoint-750/scheduler.pt b/checkpoint-750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..75b1b11e520d11e6c7a2de573228d99b4252a826 --- /dev/null +++ b/checkpoint-750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a18a5853cd18eb03b38c0a1263a879b320d4ec0a4e100662af6a0fecce6e4ba +size 1064 diff --git a/checkpoint-750/special_tokens_map.json b/checkpoint-750/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-750/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-750/tokenizer.json b/checkpoint-750/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-750/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-750/tokenizer_config.json b/checkpoint-750/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-750/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-750/trainer_state.json b/checkpoint-750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e0cbb0c56c92f73b25409c7f482bac9c8f4d4232 --- /dev/null +++ b/checkpoint-750/trainer_state.json @@ -0,0 +1,11283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6223776223776225, + "eval_steps": 500, + "global_step": 750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + }, + { + "completion_length": 200.5, + "epoch": 2.451048951048951, + "grad_norm": 0.9199966788291931, + "kl": 0.29405680298805237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.0118, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 701 + }, + { + "completion_length": 551.8333740234375, + "epoch": 2.4545454545454546, + "grad_norm": 0.5847578644752502, + "kl": 0.30494964122772217, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0122, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 702 + }, + { + "completion_length": 158.5, + "epoch": 2.458041958041958, + "grad_norm": 3.148179054260254, + "kl": 0.33209604024887085, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0133, + "reward": 2.9583334922790527, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 703 + }, + { + "completion_length": 635.1666870117188, + "epoch": 2.4615384615384617, + "grad_norm": 0.7368152141571045, + "kl": 0.2001763880252838, + "learning_rate": 4.093559974371725e-06, + "loss": 0.008, + "reward": 3.204166889190674, + "reward_std": 0.42143115401268005, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 704 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.465034965034965, + "grad_norm": 0.7404118776321411, + "kl": 0.2592664361000061, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0104, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 705 + }, + { + "completion_length": 203.5, + "epoch": 2.4685314685314683, + "grad_norm": 0.7086665630340576, + "kl": 0.28512802720069885, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0114, + "reward": 2.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 706 + }, + { + "completion_length": 175.33334350585938, + "epoch": 2.472027972027972, + "grad_norm": 3.0447657108306885, + "kl": 0.38635802268981934, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0155, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 707 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.4755244755244754, + "grad_norm": 0.7985562682151794, + "kl": 0.30575287342071533, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 708 + }, + { + "completion_length": 212.5, + "epoch": 2.479020979020979, + "grad_norm": 1.0262845754623413, + "kl": 0.30596381425857544, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 709 + }, + { + "completion_length": 199.5, + "epoch": 2.4825174825174825, + "grad_norm": 0.8163771629333496, + "kl": 0.23148366808891296, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 710 + }, + { + "completion_length": 228.33334350585938, + "epoch": 2.486013986013986, + "grad_norm": 0.6531832218170166, + "kl": 0.27565860748291016, + "learning_rate": 4.069907644123346e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 711 + }, + { + "completion_length": 487.16668701171875, + "epoch": 2.4895104895104896, + "grad_norm": 0.3693908452987671, + "kl": 0.32342347502708435, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0129, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.493006993006993, + "grad_norm": 0.822213351726532, + "kl": 0.3490138649940491, + "learning_rate": 4.063106640839264e-06, + "loss": 0.014, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 713 + }, + { + "completion_length": 178.6666717529297, + "epoch": 2.4965034965034967, + "grad_norm": 0.7303230166435242, + "kl": 0.26454809308052063, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 714 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.5, + "grad_norm": 0.792052149772644, + "kl": 0.32973194122314453, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0132, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 715 + }, + { + "completion_length": 211.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.6441434025764465, + "kl": 0.3346059024333954, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0134, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 716 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.506993006993007, + "grad_norm": 2.2384145259857178, + "kl": 0.4402106702327728, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0176, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 717 + }, + { + "completion_length": 825.8333740234375, + "epoch": 2.5104895104895104, + "grad_norm": 0.4227934777736664, + "kl": 0.19202569127082825, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0077, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 718 + }, + { + "completion_length": 199.6666717529297, + "epoch": 2.513986013986014, + "grad_norm": 0.7948997020721436, + "kl": 0.30144181847572327, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0121, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 719 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.5174825174825175, + "grad_norm": 0.7969666123390198, + "kl": 0.2623240351676941, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0105, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 720 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.520979020979021, + "grad_norm": 1.1336637735366821, + "kl": 0.2935950756072998, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0117, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 721 + }, + { + "completion_length": 667.3333740234375, + "epoch": 2.5244755244755246, + "grad_norm": 0.39087414741516113, + "kl": 0.2444695681333542, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0098, + "reward": 3.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 722 + }, + { + "completion_length": 818.5, + "epoch": 2.527972027972028, + "grad_norm": 0.42902201414108276, + "kl": 0.18485748767852783, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0074, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 723 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.5314685314685317, + "grad_norm": 0.5554837584495544, + "kl": 0.3039252758026123, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 724 + }, + { + "completion_length": 519.3333740234375, + "epoch": 2.534965034965035, + "grad_norm": 0.44166073203086853, + "kl": 0.24431876838207245, + "learning_rate": 4.021903572521802e-06, + "loss": 0.0098, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 725 + }, + { + "completion_length": 200.0, + "epoch": 2.5384615384615383, + "grad_norm": 0.7037209868431091, + "kl": 0.3631229102611542, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0145, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 726 + }, + { + "completion_length": 192.5, + "epoch": 2.541958041958042, + "grad_norm": 0.664789617061615, + "kl": 0.29182663559913635, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0117, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 727 + }, + { + "completion_length": 198.5, + "epoch": 2.5454545454545454, + "grad_norm": 1.0678514242172241, + "kl": 0.28828293085098267, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 728 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.548951048951049, + "grad_norm": 0.8395413756370544, + "kl": 0.3076155185699463, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0123, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 729 + }, + { + "completion_length": 212.0, + "epoch": 2.5524475524475525, + "grad_norm": 0.7780301570892334, + "kl": 0.2876867651939392, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0115, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 730 + }, + { + "completion_length": 183.33334350585938, + "epoch": 2.555944055944056, + "grad_norm": 0.043716005980968475, + "kl": 0.40688663721084595, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0187, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 731 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.5594405594405596, + "grad_norm": 0.7270947098731995, + "kl": 0.2820360064506531, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 732 + }, + { + "completion_length": 522.0, + "epoch": 2.562937062937063, + "grad_norm": 0.5480185747146606, + "kl": 0.2843058109283447, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0114, + "reward": 3.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 733 + }, + { + "completion_length": 192.0, + "epoch": 2.5664335664335667, + "grad_norm": 0.733644962310791, + "kl": 0.27982231974601746, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 734 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.56993006993007, + "grad_norm": 0.7122451066970825, + "kl": 0.36668699979782104, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0147, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 735 + }, + { + "completion_length": 204.33334350585938, + "epoch": 2.5734265734265733, + "grad_norm": 0.07662484794855118, + "kl": 0.3632362484931946, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 736 + }, + { + "completion_length": 189.1666717529297, + "epoch": 2.5769230769230766, + "grad_norm": 0.34811052680015564, + "kl": 0.4749183654785156, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0214, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 737 + }, + { + "completion_length": 583.5, + "epoch": 2.5804195804195804, + "grad_norm": 0.3855575919151306, + "kl": 0.2539462447166443, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0102, + "reward": 2.704166889190674, + "reward_std": 0.6021661758422852, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 738 + }, + { + "completion_length": 174.6666717529297, + "epoch": 2.583916083916084, + "grad_norm": 1.0900449752807617, + "kl": 0.3619951605796814, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0145, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 739 + }, + { + "completion_length": 203.1666717529297, + "epoch": 2.5874125874125875, + "grad_norm": 0.9708390831947327, + "kl": 0.28454601764678955, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0114, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 740 + }, + { + "completion_length": 522.1666870117188, + "epoch": 2.590909090909091, + "grad_norm": 0.6295937895774841, + "kl": 0.26834964752197266, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0107, + "reward": 3.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 741 + }, + { + "completion_length": 703.5, + "epoch": 2.594405594405594, + "grad_norm": 0.35760697722435, + "kl": 0.28400832414627075, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0114, + "reward": 3.066666603088379, + "reward_std": 0.2857738435268402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 742 + }, + { + "completion_length": 833.8333740234375, + "epoch": 2.597902097902098, + "grad_norm": 0.5528135895729065, + "kl": 0.28165918588638306, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0113, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 743 + }, + { + "completion_length": 143.33334350585938, + "epoch": 2.6013986013986012, + "grad_norm": 0.7684369683265686, + "kl": 0.3106473684310913, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0124, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 744 + }, + { + "completion_length": 789.1666870117188, + "epoch": 2.604895104895105, + "grad_norm": 0.9867936372756958, + "kl": 0.2591046094894409, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 0.5224940180778503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 745 + }, + { + "completion_length": 194.83334350585938, + "epoch": 2.6083916083916083, + "grad_norm": 0.7808223962783813, + "kl": 0.30762046575546265, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 746 + }, + { + "completion_length": 503.8333435058594, + "epoch": 2.6118881118881117, + "grad_norm": 0.6441946625709534, + "kl": 0.2855534851551056, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0114, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 747 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6153846153846154, + "grad_norm": 0.8443914651870728, + "kl": 0.32207822799682617, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0129, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 748 + }, + { + "completion_length": 193.6666717529297, + "epoch": 2.6188811188811187, + "grad_norm": 0.620596706867218, + "kl": 0.33432909846305847, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0134, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 749 + }, + { + "completion_length": 218.33334350585938, + "epoch": 2.6223776223776225, + "grad_norm": 0.8599146604537964, + "kl": 0.2318965494632721, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0093, + "reward": 2.0375001430511475, + "reward_std": 0.4857339859008789, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 750 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-750/training_args.bin b/checkpoint-750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d4b054ff40327f56b6a032fd25746859fdf7fa02 --- /dev/null +++ b/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d12353ea40442ff3a274491793030455b38d38b19d8d82aec992ecf072e7a0 +size 778096664 diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fc00ec7a53aa9ea514ac205f4b8273d33803c91 --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c268dba77c690a85d961068b11046a96a8676e0b3862dd20917c9ceae1a5a9 +size 395571252 diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f99aefc038be5725d771f6a26bc71abc1b9ccce --- /dev/null +++ b/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b713c12bceac72271f1e8746c8f09cd5b70b4d39fe10226f1fc8cc3af72f7a2 +size 14244 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fbe17b5367a7a1c98059c65d027ff6ed8ed204f --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab88cd4a44fb116f194d08e6a0ad4c52384eafc503d226ab3f3e9ff97e1a7487 +size 1064 diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-800/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-800/tokenizer.json b/checkpoint-800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..406ce143600b645953815b5a8a96c9fdb0ffa1b5 --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,12033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.797202797202797, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + }, + { + "completion_length": 200.5, + "epoch": 2.451048951048951, + "grad_norm": 0.9199966788291931, + "kl": 0.29405680298805237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.0118, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 701 + }, + { + "completion_length": 551.8333740234375, + "epoch": 2.4545454545454546, + "grad_norm": 0.5847578644752502, + "kl": 0.30494964122772217, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0122, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 702 + }, + { + "completion_length": 158.5, + "epoch": 2.458041958041958, + "grad_norm": 3.148179054260254, + "kl": 0.33209604024887085, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0133, + "reward": 2.9583334922790527, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 703 + }, + { + "completion_length": 635.1666870117188, + "epoch": 2.4615384615384617, + "grad_norm": 0.7368152141571045, + "kl": 0.2001763880252838, + "learning_rate": 4.093559974371725e-06, + "loss": 0.008, + "reward": 3.204166889190674, + "reward_std": 0.42143115401268005, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 704 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.465034965034965, + "grad_norm": 0.7404118776321411, + "kl": 0.2592664361000061, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0104, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 705 + }, + { + "completion_length": 203.5, + "epoch": 2.4685314685314683, + "grad_norm": 0.7086665630340576, + "kl": 0.28512802720069885, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0114, + "reward": 2.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 706 + }, + { + "completion_length": 175.33334350585938, + "epoch": 2.472027972027972, + "grad_norm": 3.0447657108306885, + "kl": 0.38635802268981934, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0155, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 707 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.4755244755244754, + "grad_norm": 0.7985562682151794, + "kl": 0.30575287342071533, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 708 + }, + { + "completion_length": 212.5, + "epoch": 2.479020979020979, + "grad_norm": 1.0262845754623413, + "kl": 0.30596381425857544, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 709 + }, + { + "completion_length": 199.5, + "epoch": 2.4825174825174825, + "grad_norm": 0.8163771629333496, + "kl": 0.23148366808891296, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 710 + }, + { + "completion_length": 228.33334350585938, + "epoch": 2.486013986013986, + "grad_norm": 0.6531832218170166, + "kl": 0.27565860748291016, + "learning_rate": 4.069907644123346e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 711 + }, + { + "completion_length": 487.16668701171875, + "epoch": 2.4895104895104896, + "grad_norm": 0.3693908452987671, + "kl": 0.32342347502708435, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0129, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.493006993006993, + "grad_norm": 0.822213351726532, + "kl": 0.3490138649940491, + "learning_rate": 4.063106640839264e-06, + "loss": 0.014, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 713 + }, + { + "completion_length": 178.6666717529297, + "epoch": 2.4965034965034967, + "grad_norm": 0.7303230166435242, + "kl": 0.26454809308052063, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 714 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.5, + "grad_norm": 0.792052149772644, + "kl": 0.32973194122314453, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0132, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 715 + }, + { + "completion_length": 211.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.6441434025764465, + "kl": 0.3346059024333954, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0134, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 716 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.506993006993007, + "grad_norm": 2.2384145259857178, + "kl": 0.4402106702327728, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0176, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 717 + }, + { + "completion_length": 825.8333740234375, + "epoch": 2.5104895104895104, + "grad_norm": 0.4227934777736664, + "kl": 0.19202569127082825, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0077, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 718 + }, + { + "completion_length": 199.6666717529297, + "epoch": 2.513986013986014, + "grad_norm": 0.7948997020721436, + "kl": 0.30144181847572327, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0121, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 719 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.5174825174825175, + "grad_norm": 0.7969666123390198, + "kl": 0.2623240351676941, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0105, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 720 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.520979020979021, + "grad_norm": 1.1336637735366821, + "kl": 0.2935950756072998, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0117, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 721 + }, + { + "completion_length": 667.3333740234375, + "epoch": 2.5244755244755246, + "grad_norm": 0.39087414741516113, + "kl": 0.2444695681333542, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0098, + "reward": 3.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 722 + }, + { + "completion_length": 818.5, + "epoch": 2.527972027972028, + "grad_norm": 0.42902201414108276, + "kl": 0.18485748767852783, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0074, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 723 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.5314685314685317, + "grad_norm": 0.5554837584495544, + "kl": 0.3039252758026123, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 724 + }, + { + "completion_length": 519.3333740234375, + "epoch": 2.534965034965035, + "grad_norm": 0.44166073203086853, + "kl": 0.24431876838207245, + "learning_rate": 4.021903572521802e-06, + "loss": 0.0098, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 725 + }, + { + "completion_length": 200.0, + "epoch": 2.5384615384615383, + "grad_norm": 0.7037209868431091, + "kl": 0.3631229102611542, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0145, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 726 + }, + { + "completion_length": 192.5, + "epoch": 2.541958041958042, + "grad_norm": 0.664789617061615, + "kl": 0.29182663559913635, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0117, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 727 + }, + { + "completion_length": 198.5, + "epoch": 2.5454545454545454, + "grad_norm": 1.0678514242172241, + "kl": 0.28828293085098267, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 728 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.548951048951049, + "grad_norm": 0.8395413756370544, + "kl": 0.3076155185699463, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0123, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 729 + }, + { + "completion_length": 212.0, + "epoch": 2.5524475524475525, + "grad_norm": 0.7780301570892334, + "kl": 0.2876867651939392, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0115, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 730 + }, + { + "completion_length": 183.33334350585938, + "epoch": 2.555944055944056, + "grad_norm": 0.043716005980968475, + "kl": 0.40688663721084595, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0187, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 731 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.5594405594405596, + "grad_norm": 0.7270947098731995, + "kl": 0.2820360064506531, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 732 + }, + { + "completion_length": 522.0, + "epoch": 2.562937062937063, + "grad_norm": 0.5480185747146606, + "kl": 0.2843058109283447, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0114, + "reward": 3.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 733 + }, + { + "completion_length": 192.0, + "epoch": 2.5664335664335667, + "grad_norm": 0.733644962310791, + "kl": 0.27982231974601746, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 734 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.56993006993007, + "grad_norm": 0.7122451066970825, + "kl": 0.36668699979782104, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0147, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 735 + }, + { + "completion_length": 204.33334350585938, + "epoch": 2.5734265734265733, + "grad_norm": 0.07662484794855118, + "kl": 0.3632362484931946, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 736 + }, + { + "completion_length": 189.1666717529297, + "epoch": 2.5769230769230766, + "grad_norm": 0.34811052680015564, + "kl": 0.4749183654785156, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0214, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 737 + }, + { + "completion_length": 583.5, + "epoch": 2.5804195804195804, + "grad_norm": 0.3855575919151306, + "kl": 0.2539462447166443, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0102, + "reward": 2.704166889190674, + "reward_std": 0.6021661758422852, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 738 + }, + { + "completion_length": 174.6666717529297, + "epoch": 2.583916083916084, + "grad_norm": 1.0900449752807617, + "kl": 0.3619951605796814, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0145, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 739 + }, + { + "completion_length": 203.1666717529297, + "epoch": 2.5874125874125875, + "grad_norm": 0.9708390831947327, + "kl": 0.28454601764678955, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0114, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 740 + }, + { + "completion_length": 522.1666870117188, + "epoch": 2.590909090909091, + "grad_norm": 0.6295937895774841, + "kl": 0.26834964752197266, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0107, + "reward": 3.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 741 + }, + { + "completion_length": 703.5, + "epoch": 2.594405594405594, + "grad_norm": 0.35760697722435, + "kl": 0.28400832414627075, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0114, + "reward": 3.066666603088379, + "reward_std": 0.2857738435268402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 742 + }, + { + "completion_length": 833.8333740234375, + "epoch": 2.597902097902098, + "grad_norm": 0.5528135895729065, + "kl": 0.28165918588638306, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0113, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 743 + }, + { + "completion_length": 143.33334350585938, + "epoch": 2.6013986013986012, + "grad_norm": 0.7684369683265686, + "kl": 0.3106473684310913, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0124, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 744 + }, + { + "completion_length": 789.1666870117188, + "epoch": 2.604895104895105, + "grad_norm": 0.9867936372756958, + "kl": 0.2591046094894409, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 0.5224940180778503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 745 + }, + { + "completion_length": 194.83334350585938, + "epoch": 2.6083916083916083, + "grad_norm": 0.7808223962783813, + "kl": 0.30762046575546265, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 746 + }, + { + "completion_length": 503.8333435058594, + "epoch": 2.6118881118881117, + "grad_norm": 0.6441946625709534, + "kl": 0.2855534851551056, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0114, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 747 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6153846153846154, + "grad_norm": 0.8443914651870728, + "kl": 0.32207822799682617, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0129, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 748 + }, + { + "completion_length": 193.6666717529297, + "epoch": 2.6188811188811187, + "grad_norm": 0.620596706867218, + "kl": 0.33432909846305847, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0134, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 749 + }, + { + "completion_length": 218.33334350585938, + "epoch": 2.6223776223776225, + "grad_norm": 0.8599146604537964, + "kl": 0.2318965494632721, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0093, + "reward": 2.0375001430511475, + "reward_std": 0.4857339859008789, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 750 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.625874125874126, + "grad_norm": 0.042067479342222214, + "kl": 0.25582045316696167, + "learning_rate": 3.930364683613791e-06, + "loss": 0.0114, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 751 + }, + { + "completion_length": 187.6666717529297, + "epoch": 2.629370629370629, + "grad_norm": 0.6770573854446411, + "kl": 0.2656649649143219, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 752 + }, + { + "completion_length": 217.5, + "epoch": 2.632867132867133, + "grad_norm": 1.6130694150924683, + "kl": 0.29323238134384155, + "learning_rate": 3.923198808577111e-06, + "loss": 0.0117, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 753 + }, + { + "completion_length": 224.83334350585938, + "epoch": 2.6363636363636362, + "grad_norm": 0.7095122933387756, + "kl": 0.27353787422180176, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 754 + }, + { + "completion_length": 827.5, + "epoch": 2.63986013986014, + "grad_norm": 0.5739628076553345, + "kl": 0.21068716049194336, + "learning_rate": 3.916015592312083e-06, + "loss": 0.0084, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 755 + }, + { + "completion_length": 186.0, + "epoch": 2.6433566433566433, + "grad_norm": 0.8608355522155762, + "kl": 0.3407597243785858, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0136, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 756 + }, + { + "completion_length": 556.3333740234375, + "epoch": 2.6468531468531467, + "grad_norm": 0.3163861036300659, + "kl": 0.2427646368741989, + "learning_rate": 3.908815122343929e-06, + "loss": 0.0097, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 757 + }, + { + "completion_length": 187.5, + "epoch": 2.6503496503496504, + "grad_norm": 0.8031748533248901, + "kl": 0.30763155221939087, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0123, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 758 + }, + { + "completion_length": 218.1666717529297, + "epoch": 2.6538461538461537, + "grad_norm": 0.8372368216514587, + "kl": 0.28790879249572754, + "learning_rate": 3.901597486408105e-06, + "loss": 0.0115, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 759 + }, + { + "completion_length": 1181.8333740234375, + "epoch": 2.6573426573426575, + "grad_norm": 0.647392988204956, + "kl": 0.20365619659423828, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0081, + "reward": 1.7208335399627686, + "reward_std": 1.5208892822265625, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5541666746139526, + "step": 760 + }, + { + "completion_length": 180.1666717529297, + "epoch": 2.660839160839161, + "grad_norm": 0.6884165406227112, + "kl": 0.2719978392124176, + "learning_rate": 3.894362772449226e-06, + "loss": 0.0109, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 761 + }, + { + "completion_length": 257.8333435058594, + "epoch": 2.664335664335664, + "grad_norm": 1.337699055671692, + "kl": 0.5194430351257324, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0208, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 762 + }, + { + "completion_length": 498.16668701171875, + "epoch": 2.667832167832168, + "grad_norm": 0.9563208818435669, + "kl": 0.3499029874801636, + "learning_rate": 3.887111068619999e-06, + "loss": 0.014, + "reward": 1.75, + "reward_std": 1.1730302572250366, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 763 + }, + { + "completion_length": 215.33334350585938, + "epoch": 2.6713286713286712, + "grad_norm": 0.5849650502204895, + "kl": 0.21754197776317596, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0087, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 764 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.674825174825175, + "grad_norm": 0.5816351771354675, + "kl": 0.2685267925262451, + "learning_rate": 3.879842463280146e-06, + "loss": 0.0107, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 765 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6783216783216783, + "grad_norm": 0.7096436023712158, + "kl": 0.3302849531173706, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 766 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.6818181818181817, + "grad_norm": 0.7019976377487183, + "kl": 0.3386441469192505, + "learning_rate": 3.87255704499533e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 767 + }, + { + "completion_length": 220.83334350585938, + "epoch": 2.6853146853146854, + "grad_norm": 0.7764424681663513, + "kl": 0.25025084614753723, + "learning_rate": 3.868908058731376e-06, + "loss": 0.01, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 768 + }, + { + "completion_length": 191.1666717529297, + "epoch": 2.6888111888111887, + "grad_norm": 0.6796668767929077, + "kl": 0.2684442698955536, + "learning_rate": 3.865254902536073e-06, + "loss": 0.0107, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 769 + }, + { + "completion_length": 898.3333740234375, + "epoch": 2.6923076923076925, + "grad_norm": 0.38831865787506104, + "kl": 0.14873462915420532, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0059, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 770 + }, + { + "completion_length": 1279.166748046875, + "epoch": 2.695804195804196, + "grad_norm": 0.6360457539558411, + "kl": 0.1556037813425064, + "learning_rate": 3.857936124876677e-06, + "loss": 0.0062, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 771 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.699300699300699, + "grad_norm": 0.8891352415084839, + "kl": 0.2973707318305969, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0119, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 772 + }, + { + "completion_length": 223.0, + "epoch": 2.702797202797203, + "grad_norm": 0.9200516939163208, + "kl": 0.2344827651977539, + "learning_rate": 3.850600801194138e-06, + "loss": 0.0094, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 773 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.7062937062937062, + "grad_norm": 1.2495554685592651, + "kl": 0.4023559093475342, + "learning_rate": 3.846926962517158e-06, + "loss": 0.0161, + "reward": 2.4000000953674316, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 774 + }, + { + "completion_length": 186.83334350585938, + "epoch": 2.70979020979021, + "grad_norm": 0.7409746646881104, + "kl": 0.2839186489582062, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 775 + }, + { + "completion_length": 187.1666717529297, + "epoch": 2.7132867132867133, + "grad_norm": 0.9320999383926392, + "kl": 0.2990000247955322, + "learning_rate": 3.839566987447492e-06, + "loss": 0.012, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 776 + }, + { + "completion_length": 531.8333740234375, + "epoch": 2.7167832167832167, + "grad_norm": 0.3263534903526306, + "kl": 0.2381911277770996, + "learning_rate": 3.835880873474567e-06, + "loss": 0.0095, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 777 + }, + { + "completion_length": 1248.666748046875, + "epoch": 2.7202797202797204, + "grad_norm": 0.5097912549972534, + "kl": 0.1756594479084015, + "learning_rate": 3.832190690176825e-06, + "loss": 0.007, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 778 + }, + { + "completion_length": 546.5, + "epoch": 2.7237762237762237, + "grad_norm": 0.38489583134651184, + "kl": 0.233808696269989, + "learning_rate": 3.828496448795208e-06, + "loss": 0.0094, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 779 + }, + { + "completion_length": 200.5, + "epoch": 2.7272727272727275, + "grad_norm": 0.6196880340576172, + "kl": 0.28656402230262756, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 780 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.730769230769231, + "grad_norm": 0.06716328859329224, + "kl": 0.35444962978363037, + "learning_rate": 3.821095836805868e-06, + "loss": 0.0166, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 781 + }, + { + "completion_length": 1108.0, + "epoch": 2.734265734265734, + "grad_norm": 0.46010759472846985, + "kl": 0.2134471833705902, + "learning_rate": 3.817389488741694e-06, + "loss": 0.0085, + "reward": 2.9375, + "reward_std": 1.2437593936920166, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7708332538604736, + "step": 782 + }, + { + "completion_length": 192.83334350585938, + "epoch": 2.737762237762238, + "grad_norm": 0.7892248034477234, + "kl": 0.27930283546447754, + "learning_rate": 3.8136791276806695e-06, + "loss": 0.0112, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 783 + }, + { + "completion_length": 526.1666870117188, + "epoch": 2.7412587412587412, + "grad_norm": 0.5663818120956421, + "kl": 0.23246847093105316, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.0093, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 784 + }, + { + "completion_length": 302.0, + "epoch": 2.744755244755245, + "grad_norm": 0.8390914797782898, + "kl": 0.304746150970459, + "learning_rate": 3.806246411789872e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 785 + }, + { + "completion_length": 463.66668701171875, + "epoch": 2.7482517482517483, + "grad_norm": 0.4586171507835388, + "kl": 0.2490534633398056, + "learning_rate": 3.802524079601442e-06, + "loss": 0.01, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 786 + }, + { + "completion_length": 538.1666870117188, + "epoch": 2.7517482517482517, + "grad_norm": 0.4255636930465698, + "kl": 0.21123819053173065, + "learning_rate": 3.798797779698774e-06, + "loss": 0.0084, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 787 + }, + { + "completion_length": 499.0, + "epoch": 2.755244755244755, + "grad_norm": 0.5292470455169678, + "kl": 0.24850648641586304, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0099, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 788 + }, + { + "completion_length": 527.6666870117188, + "epoch": 2.7587412587412588, + "grad_norm": 0.6640042662620544, + "kl": 0.21320059895515442, + "learning_rate": 3.791333322166605e-06, + "loss": 0.0085, + "reward": 2.3500001430511475, + "reward_std": 0.6928203105926514, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 789 + }, + { + "completion_length": 212.5, + "epoch": 2.762237762237762, + "grad_norm": 0.7885140776634216, + "kl": 0.25268059968948364, + "learning_rate": 3.787595187275136e-06, + "loss": 0.0101, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 790 + }, + { + "completion_length": 185.0, + "epoch": 2.765734265734266, + "grad_norm": 1.2679868936538696, + "kl": 0.35767948627471924, + "learning_rate": 3.7838531301454257e-06, + "loss": 0.0143, + "reward": 2.3500001430511475, + "reward_std": 0.9380831718444824, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 791 + }, + { + "completion_length": 202.0, + "epoch": 2.769230769230769, + "grad_norm": 0.6652596592903137, + "kl": 0.2619841694831848, + "learning_rate": 3.780107162176429e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 792 + }, + { + "completion_length": 474.0, + "epoch": 2.7727272727272725, + "grad_norm": 8.084759712219238, + "kl": 2.9472758769989014, + "learning_rate": 3.776357294779015e-06, + "loss": 0.1179, + "reward": 2.133333444595337, + "reward_std": 1.6972527503967285, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 793 + }, + { + "completion_length": 487.3333435058594, + "epoch": 2.7762237762237763, + "grad_norm": 0.43876194953918457, + "kl": 0.234140545129776, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0094, + "reward": 2.875, + "reward_std": 1.2624380588531494, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 794 + }, + { + "completion_length": 216.5, + "epoch": 2.7797202797202796, + "grad_norm": 0.7178113460540771, + "kl": 0.3248441517353058, + "learning_rate": 3.768845907401761e-06, + "loss": 0.013, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 795 + }, + { + "completion_length": 425.66668701171875, + "epoch": 2.7832167832167833, + "grad_norm": 0.4357425570487976, + "kl": 0.24865001440048218, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0099, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 796 + }, + { + "completion_length": 827.1666870117188, + "epoch": 2.7867132867132867, + "grad_norm": 0.36945709586143494, + "kl": 0.26294025778770447, + "learning_rate": 3.7613190595375484e-06, + "loss": 0.0105, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 797 + }, + { + "completion_length": 193.33334350585938, + "epoch": 2.79020979020979, + "grad_norm": 1.0428582429885864, + "kl": 0.3159600496292114, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0126, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 798 + }, + { + "completion_length": 573.1666870117188, + "epoch": 2.7937062937062938, + "grad_norm": 0.7567842602729797, + "kl": 0.37068232893943787, + "learning_rate": 3.753776842898644e-06, + "loss": 0.0148, + "reward": 2.704166889190674, + "reward_std": 0.8732721209526062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 799 + }, + { + "completion_length": 500.66668701171875, + "epoch": 2.797202797202797, + "grad_norm": 0.5451098680496216, + "kl": 0.24930475652217865, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.01, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-850/README.md b/checkpoint-850/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-850/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-850/adapter_config.json b/checkpoint-850/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-850/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-850/adapter_model.safetensors b/checkpoint-850/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d75f6578f2f41184a6b9e4628015ac314d63550a --- /dev/null +++ b/checkpoint-850/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249f51b2baf902beb4e1285e7708fd882cb04c1881bd63ec10a551a90ad0418e +size 778096664 diff --git a/checkpoint-850/optimizer.pt b/checkpoint-850/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4313e3f2e07ef14223406c2de59d766304047359 --- /dev/null +++ b/checkpoint-850/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027ebd38f372dcfcd59ca6279dbccd8a993043808c5ce8b769b670084061fc9c +size 395571252 diff --git a/checkpoint-850/rng_state.pth b/checkpoint-850/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..53262239a5089628204d69f8543948e3f4837a7a --- /dev/null +++ b/checkpoint-850/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c3c904a7fb82ec770214188dba51ef045d89ba08b29cacd8c2df4bbcca0696 +size 14244 diff --git a/checkpoint-850/scheduler.pt b/checkpoint-850/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..788a38797691dacc66de2a4e34562c019ec1fbf6 --- /dev/null +++ b/checkpoint-850/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ecb7833a81a2a69c969de339f3f8a30e42b2c124e99cfa43297047bc97fb5e2 +size 1064 diff --git a/checkpoint-850/special_tokens_map.json b/checkpoint-850/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-850/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-850/tokenizer.json b/checkpoint-850/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-850/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-850/tokenizer_config.json b/checkpoint-850/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-850/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-850/trainer_state.json b/checkpoint-850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcced227ff7cb162673e9b9b83dd5b5e82efd29d --- /dev/null +++ b/checkpoint-850/trainer_state.json @@ -0,0 +1,12783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.972027972027972, + "eval_steps": 500, + "global_step": 850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + }, + { + "completion_length": 200.5, + "epoch": 2.451048951048951, + "grad_norm": 0.9199966788291931, + "kl": 0.29405680298805237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.0118, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 701 + }, + { + "completion_length": 551.8333740234375, + "epoch": 2.4545454545454546, + "grad_norm": 0.5847578644752502, + "kl": 0.30494964122772217, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0122, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 702 + }, + { + "completion_length": 158.5, + "epoch": 2.458041958041958, + "grad_norm": 3.148179054260254, + "kl": 0.33209604024887085, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0133, + "reward": 2.9583334922790527, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 703 + }, + { + "completion_length": 635.1666870117188, + "epoch": 2.4615384615384617, + "grad_norm": 0.7368152141571045, + "kl": 0.2001763880252838, + "learning_rate": 4.093559974371725e-06, + "loss": 0.008, + "reward": 3.204166889190674, + "reward_std": 0.42143115401268005, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 704 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.465034965034965, + "grad_norm": 0.7404118776321411, + "kl": 0.2592664361000061, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0104, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 705 + }, + { + "completion_length": 203.5, + "epoch": 2.4685314685314683, + "grad_norm": 0.7086665630340576, + "kl": 0.28512802720069885, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0114, + "reward": 2.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 706 + }, + { + "completion_length": 175.33334350585938, + "epoch": 2.472027972027972, + "grad_norm": 3.0447657108306885, + "kl": 0.38635802268981934, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0155, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 707 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.4755244755244754, + "grad_norm": 0.7985562682151794, + "kl": 0.30575287342071533, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 708 + }, + { + "completion_length": 212.5, + "epoch": 2.479020979020979, + "grad_norm": 1.0262845754623413, + "kl": 0.30596381425857544, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 709 + }, + { + "completion_length": 199.5, + "epoch": 2.4825174825174825, + "grad_norm": 0.8163771629333496, + "kl": 0.23148366808891296, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 710 + }, + { + "completion_length": 228.33334350585938, + "epoch": 2.486013986013986, + "grad_norm": 0.6531832218170166, + "kl": 0.27565860748291016, + "learning_rate": 4.069907644123346e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 711 + }, + { + "completion_length": 487.16668701171875, + "epoch": 2.4895104895104896, + "grad_norm": 0.3693908452987671, + "kl": 0.32342347502708435, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0129, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.493006993006993, + "grad_norm": 0.822213351726532, + "kl": 0.3490138649940491, + "learning_rate": 4.063106640839264e-06, + "loss": 0.014, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 713 + }, + { + "completion_length": 178.6666717529297, + "epoch": 2.4965034965034967, + "grad_norm": 0.7303230166435242, + "kl": 0.26454809308052063, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 714 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.5, + "grad_norm": 0.792052149772644, + "kl": 0.32973194122314453, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0132, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 715 + }, + { + "completion_length": 211.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.6441434025764465, + "kl": 0.3346059024333954, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0134, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 716 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.506993006993007, + "grad_norm": 2.2384145259857178, + "kl": 0.4402106702327728, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0176, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 717 + }, + { + "completion_length": 825.8333740234375, + "epoch": 2.5104895104895104, + "grad_norm": 0.4227934777736664, + "kl": 0.19202569127082825, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0077, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 718 + }, + { + "completion_length": 199.6666717529297, + "epoch": 2.513986013986014, + "grad_norm": 0.7948997020721436, + "kl": 0.30144181847572327, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0121, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 719 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.5174825174825175, + "grad_norm": 0.7969666123390198, + "kl": 0.2623240351676941, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0105, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 720 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.520979020979021, + "grad_norm": 1.1336637735366821, + "kl": 0.2935950756072998, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0117, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 721 + }, + { + "completion_length": 667.3333740234375, + "epoch": 2.5244755244755246, + "grad_norm": 0.39087414741516113, + "kl": 0.2444695681333542, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0098, + "reward": 3.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 722 + }, + { + "completion_length": 818.5, + "epoch": 2.527972027972028, + "grad_norm": 0.42902201414108276, + "kl": 0.18485748767852783, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0074, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 723 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.5314685314685317, + "grad_norm": 0.5554837584495544, + "kl": 0.3039252758026123, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 724 + }, + { + "completion_length": 519.3333740234375, + "epoch": 2.534965034965035, + "grad_norm": 0.44166073203086853, + "kl": 0.24431876838207245, + "learning_rate": 4.021903572521802e-06, + "loss": 0.0098, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 725 + }, + { + "completion_length": 200.0, + "epoch": 2.5384615384615383, + "grad_norm": 0.7037209868431091, + "kl": 0.3631229102611542, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0145, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 726 + }, + { + "completion_length": 192.5, + "epoch": 2.541958041958042, + "grad_norm": 0.664789617061615, + "kl": 0.29182663559913635, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0117, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 727 + }, + { + "completion_length": 198.5, + "epoch": 2.5454545454545454, + "grad_norm": 1.0678514242172241, + "kl": 0.28828293085098267, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 728 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.548951048951049, + "grad_norm": 0.8395413756370544, + "kl": 0.3076155185699463, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0123, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 729 + }, + { + "completion_length": 212.0, + "epoch": 2.5524475524475525, + "grad_norm": 0.7780301570892334, + "kl": 0.2876867651939392, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0115, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 730 + }, + { + "completion_length": 183.33334350585938, + "epoch": 2.555944055944056, + "grad_norm": 0.043716005980968475, + "kl": 0.40688663721084595, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0187, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 731 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.5594405594405596, + "grad_norm": 0.7270947098731995, + "kl": 0.2820360064506531, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 732 + }, + { + "completion_length": 522.0, + "epoch": 2.562937062937063, + "grad_norm": 0.5480185747146606, + "kl": 0.2843058109283447, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0114, + "reward": 3.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 733 + }, + { + "completion_length": 192.0, + "epoch": 2.5664335664335667, + "grad_norm": 0.733644962310791, + "kl": 0.27982231974601746, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 734 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.56993006993007, + "grad_norm": 0.7122451066970825, + "kl": 0.36668699979782104, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0147, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 735 + }, + { + "completion_length": 204.33334350585938, + "epoch": 2.5734265734265733, + "grad_norm": 0.07662484794855118, + "kl": 0.3632362484931946, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 736 + }, + { + "completion_length": 189.1666717529297, + "epoch": 2.5769230769230766, + "grad_norm": 0.34811052680015564, + "kl": 0.4749183654785156, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0214, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 737 + }, + { + "completion_length": 583.5, + "epoch": 2.5804195804195804, + "grad_norm": 0.3855575919151306, + "kl": 0.2539462447166443, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0102, + "reward": 2.704166889190674, + "reward_std": 0.6021661758422852, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 738 + }, + { + "completion_length": 174.6666717529297, + "epoch": 2.583916083916084, + "grad_norm": 1.0900449752807617, + "kl": 0.3619951605796814, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0145, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 739 + }, + { + "completion_length": 203.1666717529297, + "epoch": 2.5874125874125875, + "grad_norm": 0.9708390831947327, + "kl": 0.28454601764678955, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0114, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 740 + }, + { + "completion_length": 522.1666870117188, + "epoch": 2.590909090909091, + "grad_norm": 0.6295937895774841, + "kl": 0.26834964752197266, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0107, + "reward": 3.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 741 + }, + { + "completion_length": 703.5, + "epoch": 2.594405594405594, + "grad_norm": 0.35760697722435, + "kl": 0.28400832414627075, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0114, + "reward": 3.066666603088379, + "reward_std": 0.2857738435268402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 742 + }, + { + "completion_length": 833.8333740234375, + "epoch": 2.597902097902098, + "grad_norm": 0.5528135895729065, + "kl": 0.28165918588638306, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0113, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 743 + }, + { + "completion_length": 143.33334350585938, + "epoch": 2.6013986013986012, + "grad_norm": 0.7684369683265686, + "kl": 0.3106473684310913, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0124, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 744 + }, + { + "completion_length": 789.1666870117188, + "epoch": 2.604895104895105, + "grad_norm": 0.9867936372756958, + "kl": 0.2591046094894409, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 0.5224940180778503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 745 + }, + { + "completion_length": 194.83334350585938, + "epoch": 2.6083916083916083, + "grad_norm": 0.7808223962783813, + "kl": 0.30762046575546265, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 746 + }, + { + "completion_length": 503.8333435058594, + "epoch": 2.6118881118881117, + "grad_norm": 0.6441946625709534, + "kl": 0.2855534851551056, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0114, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 747 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6153846153846154, + "grad_norm": 0.8443914651870728, + "kl": 0.32207822799682617, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0129, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 748 + }, + { + "completion_length": 193.6666717529297, + "epoch": 2.6188811188811187, + "grad_norm": 0.620596706867218, + "kl": 0.33432909846305847, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0134, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 749 + }, + { + "completion_length": 218.33334350585938, + "epoch": 2.6223776223776225, + "grad_norm": 0.8599146604537964, + "kl": 0.2318965494632721, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0093, + "reward": 2.0375001430511475, + "reward_std": 0.4857339859008789, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 750 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.625874125874126, + "grad_norm": 0.042067479342222214, + "kl": 0.25582045316696167, + "learning_rate": 3.930364683613791e-06, + "loss": 0.0114, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 751 + }, + { + "completion_length": 187.6666717529297, + "epoch": 2.629370629370629, + "grad_norm": 0.6770573854446411, + "kl": 0.2656649649143219, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 752 + }, + { + "completion_length": 217.5, + "epoch": 2.632867132867133, + "grad_norm": 1.6130694150924683, + "kl": 0.29323238134384155, + "learning_rate": 3.923198808577111e-06, + "loss": 0.0117, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 753 + }, + { + "completion_length": 224.83334350585938, + "epoch": 2.6363636363636362, + "grad_norm": 0.7095122933387756, + "kl": 0.27353787422180176, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 754 + }, + { + "completion_length": 827.5, + "epoch": 2.63986013986014, + "grad_norm": 0.5739628076553345, + "kl": 0.21068716049194336, + "learning_rate": 3.916015592312083e-06, + "loss": 0.0084, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 755 + }, + { + "completion_length": 186.0, + "epoch": 2.6433566433566433, + "grad_norm": 0.8608355522155762, + "kl": 0.3407597243785858, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0136, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 756 + }, + { + "completion_length": 556.3333740234375, + "epoch": 2.6468531468531467, + "grad_norm": 0.3163861036300659, + "kl": 0.2427646368741989, + "learning_rate": 3.908815122343929e-06, + "loss": 0.0097, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 757 + }, + { + "completion_length": 187.5, + "epoch": 2.6503496503496504, + "grad_norm": 0.8031748533248901, + "kl": 0.30763155221939087, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0123, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 758 + }, + { + "completion_length": 218.1666717529297, + "epoch": 2.6538461538461537, + "grad_norm": 0.8372368216514587, + "kl": 0.28790879249572754, + "learning_rate": 3.901597486408105e-06, + "loss": 0.0115, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 759 + }, + { + "completion_length": 1181.8333740234375, + "epoch": 2.6573426573426575, + "grad_norm": 0.647392988204956, + "kl": 0.20365619659423828, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0081, + "reward": 1.7208335399627686, + "reward_std": 1.5208892822265625, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5541666746139526, + "step": 760 + }, + { + "completion_length": 180.1666717529297, + "epoch": 2.660839160839161, + "grad_norm": 0.6884165406227112, + "kl": 0.2719978392124176, + "learning_rate": 3.894362772449226e-06, + "loss": 0.0109, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 761 + }, + { + "completion_length": 257.8333435058594, + "epoch": 2.664335664335664, + "grad_norm": 1.337699055671692, + "kl": 0.5194430351257324, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0208, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 762 + }, + { + "completion_length": 498.16668701171875, + "epoch": 2.667832167832168, + "grad_norm": 0.9563208818435669, + "kl": 0.3499029874801636, + "learning_rate": 3.887111068619999e-06, + "loss": 0.014, + "reward": 1.75, + "reward_std": 1.1730302572250366, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 763 + }, + { + "completion_length": 215.33334350585938, + "epoch": 2.6713286713286712, + "grad_norm": 0.5849650502204895, + "kl": 0.21754197776317596, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0087, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 764 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.674825174825175, + "grad_norm": 0.5816351771354675, + "kl": 0.2685267925262451, + "learning_rate": 3.879842463280146e-06, + "loss": 0.0107, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 765 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6783216783216783, + "grad_norm": 0.7096436023712158, + "kl": 0.3302849531173706, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 766 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.6818181818181817, + "grad_norm": 0.7019976377487183, + "kl": 0.3386441469192505, + "learning_rate": 3.87255704499533e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 767 + }, + { + "completion_length": 220.83334350585938, + "epoch": 2.6853146853146854, + "grad_norm": 0.7764424681663513, + "kl": 0.25025084614753723, + "learning_rate": 3.868908058731376e-06, + "loss": 0.01, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 768 + }, + { + "completion_length": 191.1666717529297, + "epoch": 2.6888111888111887, + "grad_norm": 0.6796668767929077, + "kl": 0.2684442698955536, + "learning_rate": 3.865254902536073e-06, + "loss": 0.0107, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 769 + }, + { + "completion_length": 898.3333740234375, + "epoch": 2.6923076923076925, + "grad_norm": 0.38831865787506104, + "kl": 0.14873462915420532, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0059, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 770 + }, + { + "completion_length": 1279.166748046875, + "epoch": 2.695804195804196, + "grad_norm": 0.6360457539558411, + "kl": 0.1556037813425064, + "learning_rate": 3.857936124876677e-06, + "loss": 0.0062, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 771 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.699300699300699, + "grad_norm": 0.8891352415084839, + "kl": 0.2973707318305969, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0119, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 772 + }, + { + "completion_length": 223.0, + "epoch": 2.702797202797203, + "grad_norm": 0.9200516939163208, + "kl": 0.2344827651977539, + "learning_rate": 3.850600801194138e-06, + "loss": 0.0094, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 773 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.7062937062937062, + "grad_norm": 1.2495554685592651, + "kl": 0.4023559093475342, + "learning_rate": 3.846926962517158e-06, + "loss": 0.0161, + "reward": 2.4000000953674316, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 774 + }, + { + "completion_length": 186.83334350585938, + "epoch": 2.70979020979021, + "grad_norm": 0.7409746646881104, + "kl": 0.2839186489582062, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 775 + }, + { + "completion_length": 187.1666717529297, + "epoch": 2.7132867132867133, + "grad_norm": 0.9320999383926392, + "kl": 0.2990000247955322, + "learning_rate": 3.839566987447492e-06, + "loss": 0.012, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 776 + }, + { + "completion_length": 531.8333740234375, + "epoch": 2.7167832167832167, + "grad_norm": 0.3263534903526306, + "kl": 0.2381911277770996, + "learning_rate": 3.835880873474567e-06, + "loss": 0.0095, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 777 + }, + { + "completion_length": 1248.666748046875, + "epoch": 2.7202797202797204, + "grad_norm": 0.5097912549972534, + "kl": 0.1756594479084015, + "learning_rate": 3.832190690176825e-06, + "loss": 0.007, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 778 + }, + { + "completion_length": 546.5, + "epoch": 2.7237762237762237, + "grad_norm": 0.38489583134651184, + "kl": 0.233808696269989, + "learning_rate": 3.828496448795208e-06, + "loss": 0.0094, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 779 + }, + { + "completion_length": 200.5, + "epoch": 2.7272727272727275, + "grad_norm": 0.6196880340576172, + "kl": 0.28656402230262756, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 780 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.730769230769231, + "grad_norm": 0.06716328859329224, + "kl": 0.35444962978363037, + "learning_rate": 3.821095836805868e-06, + "loss": 0.0166, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 781 + }, + { + "completion_length": 1108.0, + "epoch": 2.734265734265734, + "grad_norm": 0.46010759472846985, + "kl": 0.2134471833705902, + "learning_rate": 3.817389488741694e-06, + "loss": 0.0085, + "reward": 2.9375, + "reward_std": 1.2437593936920166, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7708332538604736, + "step": 782 + }, + { + "completion_length": 192.83334350585938, + "epoch": 2.737762237762238, + "grad_norm": 0.7892248034477234, + "kl": 0.27930283546447754, + "learning_rate": 3.8136791276806695e-06, + "loss": 0.0112, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 783 + }, + { + "completion_length": 526.1666870117188, + "epoch": 2.7412587412587412, + "grad_norm": 0.5663818120956421, + "kl": 0.23246847093105316, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.0093, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 784 + }, + { + "completion_length": 302.0, + "epoch": 2.744755244755245, + "grad_norm": 0.8390914797782898, + "kl": 0.304746150970459, + "learning_rate": 3.806246411789872e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 785 + }, + { + "completion_length": 463.66668701171875, + "epoch": 2.7482517482517483, + "grad_norm": 0.4586171507835388, + "kl": 0.2490534633398056, + "learning_rate": 3.802524079601442e-06, + "loss": 0.01, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 786 + }, + { + "completion_length": 538.1666870117188, + "epoch": 2.7517482517482517, + "grad_norm": 0.4255636930465698, + "kl": 0.21123819053173065, + "learning_rate": 3.798797779698774e-06, + "loss": 0.0084, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 787 + }, + { + "completion_length": 499.0, + "epoch": 2.755244755244755, + "grad_norm": 0.5292470455169678, + "kl": 0.24850648641586304, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0099, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 788 + }, + { + "completion_length": 527.6666870117188, + "epoch": 2.7587412587412588, + "grad_norm": 0.6640042662620544, + "kl": 0.21320059895515442, + "learning_rate": 3.791333322166605e-06, + "loss": 0.0085, + "reward": 2.3500001430511475, + "reward_std": 0.6928203105926514, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 789 + }, + { + "completion_length": 212.5, + "epoch": 2.762237762237762, + "grad_norm": 0.7885140776634216, + "kl": 0.25268059968948364, + "learning_rate": 3.787595187275136e-06, + "loss": 0.0101, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 790 + }, + { + "completion_length": 185.0, + "epoch": 2.765734265734266, + "grad_norm": 1.2679868936538696, + "kl": 0.35767948627471924, + "learning_rate": 3.7838531301454257e-06, + "loss": 0.0143, + "reward": 2.3500001430511475, + "reward_std": 0.9380831718444824, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 791 + }, + { + "completion_length": 202.0, + "epoch": 2.769230769230769, + "grad_norm": 0.6652596592903137, + "kl": 0.2619841694831848, + "learning_rate": 3.780107162176429e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 792 + }, + { + "completion_length": 474.0, + "epoch": 2.7727272727272725, + "grad_norm": 8.084759712219238, + "kl": 2.9472758769989014, + "learning_rate": 3.776357294779015e-06, + "loss": 0.1179, + "reward": 2.133333444595337, + "reward_std": 1.6972527503967285, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 793 + }, + { + "completion_length": 487.3333435058594, + "epoch": 2.7762237762237763, + "grad_norm": 0.43876194953918457, + "kl": 0.234140545129776, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0094, + "reward": 2.875, + "reward_std": 1.2624380588531494, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 794 + }, + { + "completion_length": 216.5, + "epoch": 2.7797202797202796, + "grad_norm": 0.7178113460540771, + "kl": 0.3248441517353058, + "learning_rate": 3.768845907401761e-06, + "loss": 0.013, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 795 + }, + { + "completion_length": 425.66668701171875, + "epoch": 2.7832167832167833, + "grad_norm": 0.4357425570487976, + "kl": 0.24865001440048218, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0099, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 796 + }, + { + "completion_length": 827.1666870117188, + "epoch": 2.7867132867132867, + "grad_norm": 0.36945709586143494, + "kl": 0.26294025778770447, + "learning_rate": 3.7613190595375484e-06, + "loss": 0.0105, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 797 + }, + { + "completion_length": 193.33334350585938, + "epoch": 2.79020979020979, + "grad_norm": 1.0428582429885864, + "kl": 0.3159600496292114, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0126, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 798 + }, + { + "completion_length": 573.1666870117188, + "epoch": 2.7937062937062938, + "grad_norm": 0.7567842602729797, + "kl": 0.37068232893943787, + "learning_rate": 3.753776842898644e-06, + "loss": 0.0148, + "reward": 2.704166889190674, + "reward_std": 0.8732721209526062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 799 + }, + { + "completion_length": 500.66668701171875, + "epoch": 2.797202797202797, + "grad_norm": 0.5451098680496216, + "kl": 0.24930475652217865, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.01, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 800 + }, + { + "completion_length": 534.8333740234375, + "epoch": 2.800699300699301, + "grad_norm": 0.6100650429725647, + "kl": 0.2307787835597992, + "learning_rate": 3.7462193493845763e-06, + "loss": 0.0092, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 801 + }, + { + "completion_length": 494.3333435058594, + "epoch": 2.804195804195804, + "grad_norm": 0.3402723968029022, + "kl": 0.18503499031066895, + "learning_rate": 3.742434902568889e-06, + "loss": 0.0074, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 802 + }, + { + "completion_length": 173.5, + "epoch": 2.8076923076923075, + "grad_norm": 1.8058785200119019, + "kl": 0.3168944716453552, + "learning_rate": 3.738646671081019e-06, + "loss": 0.0127, + "reward": 2.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 803 + }, + { + "completion_length": 512.8333740234375, + "epoch": 2.8111888111888113, + "grad_norm": 0.526727020740509, + "kl": 0.23828241229057312, + "learning_rate": 3.7348546664605777e-06, + "loss": 0.0095, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 804 + }, + { + "completion_length": 527.5, + "epoch": 2.8146853146853146, + "grad_norm": 0.41340726613998413, + "kl": 0.18879906833171844, + "learning_rate": 3.7310589002586683e-06, + "loss": 0.0076, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 805 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.8181818181818183, + "grad_norm": 0.5648691058158875, + "kl": 0.315143346786499, + "learning_rate": 3.7272593840378526e-06, + "loss": 0.0126, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 806 + }, + { + "completion_length": 478.0, + "epoch": 2.8216783216783217, + "grad_norm": 0.7256986498832703, + "kl": 0.36903661489486694, + "learning_rate": 3.723456129372116e-06, + "loss": 0.0148, + "reward": 2.691666603088379, + "reward_std": 1.66505765914917, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 807 + }, + { + "completion_length": 502.3333435058594, + "epoch": 2.825174825174825, + "grad_norm": 0.44219493865966797, + "kl": 0.20148871839046478, + "learning_rate": 3.7196491478468322e-06, + "loss": 0.0081, + "reward": 3.125, + "reward_std": 1.5823242664337158, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 808 + }, + { + "completion_length": 486.5, + "epoch": 2.8286713286713288, + "grad_norm": 0.47201624512672424, + "kl": 0.21870753169059753, + "learning_rate": 3.7158384510587264e-06, + "loss": 0.0087, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 809 + }, + { + "completion_length": 537.3333740234375, + "epoch": 2.832167832167832, + "grad_norm": 0.6436510682106018, + "kl": 0.22534185647964478, + "learning_rate": 3.7120240506158433e-06, + "loss": 0.009, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 810 + }, + { + "completion_length": 237.5, + "epoch": 2.835664335664336, + "grad_norm": 0.646536648273468, + "kl": 0.27237847447395325, + "learning_rate": 3.708205958137506e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 811 + }, + { + "completion_length": 255.6666717529297, + "epoch": 2.839160839160839, + "grad_norm": 0.6682825684547424, + "kl": 0.3421524465084076, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.0137, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 812 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.8426573426573425, + "grad_norm": 0.9165672063827515, + "kl": 0.307815283536911, + "learning_rate": 3.7005587436079724e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 813 + }, + { + "completion_length": 830.8333740234375, + "epoch": 2.8461538461538463, + "grad_norm": 0.4606754779815674, + "kl": 0.20725002884864807, + "learning_rate": 3.6967296448515176e-06, + "loss": 0.0083, + "reward": 1.633333444595337, + "reward_std": 1.3418892621994019, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 814 + }, + { + "completion_length": 165.6666717529297, + "epoch": 2.8496503496503496, + "grad_norm": 8.907544136047363, + "kl": 0.7807542085647583, + "learning_rate": 3.6928969006490212e-06, + "loss": 0.0312, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 815 + }, + { + "completion_length": 852.3333740234375, + "epoch": 2.8531468531468533, + "grad_norm": 0.5166431665420532, + "kl": 0.22571682929992676, + "learning_rate": 3.689060522675689e-06, + "loss": 0.009, + "reward": 2.191666603088379, + "reward_std": 1.2499668598175049, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 816 + }, + { + "completion_length": 223.6666717529297, + "epoch": 2.8566433566433567, + "grad_norm": 0.7508683204650879, + "kl": 0.2404782623052597, + "learning_rate": 3.6852205226177907e-06, + "loss": 0.0096, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 817 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.86013986013986, + "grad_norm": 0.9129036664962769, + "kl": 0.3411031663417816, + "learning_rate": 3.6813769121726356e-06, + "loss": 0.0136, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 818 + }, + { + "completion_length": 516.5, + "epoch": 2.8636363636363638, + "grad_norm": 0.462089866399765, + "kl": 0.2940046191215515, + "learning_rate": 3.677529703048525e-06, + "loss": 0.0118, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 819 + }, + { + "completion_length": 200.33334350585938, + "epoch": 2.867132867132867, + "grad_norm": 0.8957485556602478, + "kl": 0.2655426859855652, + "learning_rate": 3.6736789069647273e-06, + "loss": 0.0106, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 820 + }, + { + "completion_length": 198.5, + "epoch": 2.870629370629371, + "grad_norm": 0.8342744708061218, + "kl": 0.24618063867092133, + "learning_rate": 3.6698245356514337e-06, + "loss": 0.0098, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 821 + }, + { + "completion_length": 153.33334350585938, + "epoch": 2.874125874125874, + "grad_norm": 0.7607580423355103, + "kl": 0.27888309955596924, + "learning_rate": 3.6659666008497287e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 822 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.8776223776223775, + "grad_norm": 0.8246819972991943, + "kl": 0.2606455087661743, + "learning_rate": 3.66210511431155e-06, + "loss": 0.0104, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 823 + }, + { + "completion_length": 509.5, + "epoch": 2.8811188811188813, + "grad_norm": 0.4599984586238861, + "kl": 0.2845722734928131, + "learning_rate": 3.658240087799655e-06, + "loss": 0.0114, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 824 + }, + { + "completion_length": 577.5, + "epoch": 2.8846153846153846, + "grad_norm": 0.3288043439388275, + "kl": 0.2456766664981842, + "learning_rate": 3.654371533087586e-06, + "loss": 0.0098, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 825 + }, + { + "completion_length": 516.3333740234375, + "epoch": 2.8881118881118883, + "grad_norm": 0.4140225946903229, + "kl": 0.2799639403820038, + "learning_rate": 3.6504994619596295e-06, + "loss": 0.0112, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 826 + }, + { + "completion_length": 180.0, + "epoch": 2.8916083916083917, + "grad_norm": 4.02695894241333, + "kl": 0.5962376594543457, + "learning_rate": 3.6466238862107884e-06, + "loss": 0.0238, + "reward": 2.625, + "reward_std": 1.1973929405212402, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 827 + }, + { + "completion_length": 220.5, + "epoch": 2.895104895104895, + "grad_norm": 0.7351843118667603, + "kl": 0.2822580337524414, + "learning_rate": 3.642744817646736e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 828 + }, + { + "completion_length": 515.0, + "epoch": 2.8986013986013988, + "grad_norm": 0.08226211369037628, + "kl": 0.25046059489250183, + "learning_rate": 3.6388622680837893e-06, + "loss": 0.0124, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 829 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.902097902097902, + "grad_norm": 0.4543350040912628, + "kl": 0.24183645844459534, + "learning_rate": 3.634976249348867e-06, + "loss": 0.0097, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 830 + }, + { + "completion_length": 218.6666717529297, + "epoch": 2.905594405594406, + "grad_norm": 0.7515471577644348, + "kl": 0.23829472064971924, + "learning_rate": 3.631086773279457e-06, + "loss": 0.0095, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 831 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.909090909090909, + "grad_norm": 0.7354035973548889, + "kl": 0.261251837015152, + "learning_rate": 3.627193851723577e-06, + "loss": 0.0105, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 832 + }, + { + "completion_length": 212.6666717529297, + "epoch": 2.9125874125874125, + "grad_norm": 0.057771261781454086, + "kl": 0.27106887102127075, + "learning_rate": 3.6232974965397416e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 833 + }, + { + "completion_length": 504.5, + "epoch": 2.916083916083916, + "grad_norm": 0.5733168125152588, + "kl": 0.2574426233768463, + "learning_rate": 3.6193977195969243e-06, + "loss": 0.0103, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 834 + }, + { + "completion_length": 201.33334350585938, + "epoch": 2.9195804195804196, + "grad_norm": 0.04076343774795532, + "kl": 0.3926897644996643, + "learning_rate": 3.6154945327745223e-06, + "loss": 0.0181, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 835 + }, + { + "completion_length": 504.66668701171875, + "epoch": 2.9230769230769234, + "grad_norm": 0.41645586490631104, + "kl": 0.28634482622146606, + "learning_rate": 3.611587947962319e-06, + "loss": 0.0115, + "reward": 2.8500001430511475, + "reward_std": 0.8366600275039673, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 836 + }, + { + "completion_length": 193.1666717529297, + "epoch": 2.9265734265734267, + "grad_norm": 0.584320068359375, + "kl": 0.36589449644088745, + "learning_rate": 3.6076779770604496e-06, + "loss": 0.0146, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 837 + }, + { + "completion_length": 151.1666717529297, + "epoch": 2.93006993006993, + "grad_norm": 20.536643981933594, + "kl": 2.479689121246338, + "learning_rate": 3.6037646319793635e-06, + "loss": 0.0992, + "reward": 2.2166666984558105, + "reward_std": 0.8577102422714233, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 838 + }, + { + "completion_length": 191.5, + "epoch": 2.9335664335664333, + "grad_norm": 16.66517448425293, + "kl": 1.5780773162841797, + "learning_rate": 3.599847924639788e-06, + "loss": 0.0631, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 839 + }, + { + "completion_length": 391.16668701171875, + "epoch": 2.937062937062937, + "grad_norm": 0.408719927072525, + "kl": 0.27218514680862427, + "learning_rate": 3.595927866972694e-06, + "loss": 0.0109, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 840 + }, + { + "completion_length": 214.83334350585938, + "epoch": 2.9405594405594404, + "grad_norm": 0.8976386189460754, + "kl": 0.3075045049190521, + "learning_rate": 3.592004470919256e-06, + "loss": 0.0123, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 841 + }, + { + "completion_length": 191.5, + "epoch": 2.944055944055944, + "grad_norm": 0.8545355796813965, + "kl": 0.3112524747848511, + "learning_rate": 3.5880777484308193e-06, + "loss": 0.0125, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 842 + }, + { + "completion_length": 169.0, + "epoch": 2.9475524475524475, + "grad_norm": 3.7284159660339355, + "kl": 0.7016023397445679, + "learning_rate": 3.5841477114688616e-06, + "loss": 0.0281, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 843 + }, + { + "completion_length": 125.16667175292969, + "epoch": 2.951048951048951, + "grad_norm": 0.08267883211374283, + "kl": 0.3959384560585022, + "learning_rate": 3.5802143720049565e-06, + "loss": 0.0182, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 844 + }, + { + "completion_length": 191.5, + "epoch": 2.9545454545454546, + "grad_norm": 0.6881157159805298, + "kl": 0.28095734119415283, + "learning_rate": 3.5762777420207382e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 845 + }, + { + "completion_length": 165.0, + "epoch": 2.958041958041958, + "grad_norm": 17.351783752441406, + "kl": 3.690758466720581, + "learning_rate": 3.5723378335078653e-06, + "loss": 0.1476, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 846 + }, + { + "completion_length": 498.8333435058594, + "epoch": 2.9615384615384617, + "grad_norm": 0.6604005098342896, + "kl": 0.22229203581809998, + "learning_rate": 3.5683946584679818e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355835437774658, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 847 + }, + { + "completion_length": 831.5, + "epoch": 2.965034965034965, + "grad_norm": 0.4960877597332001, + "kl": 0.2161356508731842, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0086, + "reward": 1.691666603088379, + "reward_std": 1.2619099617004395, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 848 + }, + { + "completion_length": 206.0, + "epoch": 2.9685314685314683, + "grad_norm": 0.7653523683547974, + "kl": 0.2660280764102936, + "learning_rate": 3.5604985568634754e-06, + "loss": 0.0106, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 849 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.972027972027972, + "grad_norm": 5.382836818695068, + "kl": 0.5285301804542542, + "learning_rate": 3.556545654351749e-06, + "loss": 0.0211, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 850 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-850/training_args.bin b/checkpoint-850/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-850/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-900/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce468575338b6571f6e8e366cc91a000c76cbeee --- /dev/null +++ b/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c27c90d54f98ea4236b91536efbe721c2f35ab357dff3e2b60a180debaba22b +size 778096664 diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..95f2aa032680e25c55e305ca43906c7ba89913d4 --- /dev/null +++ b/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d525d1f282bd156486c36553bb7f988f960735a3e214aec23212ba8dd02056dc +size 395571252 diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3d04bcf7fa3c2bf9bd08b679b4391e7faa04c809 --- /dev/null +++ b/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5b91c92a15ee2d9500884d357aad985905552c0b2e06cd6c4e22dfca5ee211 +size 14244 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..69e18ec54195f4f7ec2d161231a87a6886c53375 --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f4d8c433e235e5c4da895db277662f688cf69b75792bd4b7ceaa7a32ce3bef +size 1064 diff --git a/checkpoint-900/special_tokens_map.json b/checkpoint-900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-900/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-900/tokenizer.json b/checkpoint-900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-900/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa46094f807bf92039578c11784ba53fc9129f5 --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,13533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.1468531468531467, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + }, + { + "completion_length": 200.5, + "epoch": 2.451048951048951, + "grad_norm": 0.9199966788291931, + "kl": 0.29405680298805237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.0118, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 701 + }, + { + "completion_length": 551.8333740234375, + "epoch": 2.4545454545454546, + "grad_norm": 0.5847578644752502, + "kl": 0.30494964122772217, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0122, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 702 + }, + { + "completion_length": 158.5, + "epoch": 2.458041958041958, + "grad_norm": 3.148179054260254, + "kl": 0.33209604024887085, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0133, + "reward": 2.9583334922790527, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 703 + }, + { + "completion_length": 635.1666870117188, + "epoch": 2.4615384615384617, + "grad_norm": 0.7368152141571045, + "kl": 0.2001763880252838, + "learning_rate": 4.093559974371725e-06, + "loss": 0.008, + "reward": 3.204166889190674, + "reward_std": 0.42143115401268005, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 704 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.465034965034965, + "grad_norm": 0.7404118776321411, + "kl": 0.2592664361000061, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0104, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 705 + }, + { + "completion_length": 203.5, + "epoch": 2.4685314685314683, + "grad_norm": 0.7086665630340576, + "kl": 0.28512802720069885, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0114, + "reward": 2.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 706 + }, + { + "completion_length": 175.33334350585938, + "epoch": 2.472027972027972, + "grad_norm": 3.0447657108306885, + "kl": 0.38635802268981934, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0155, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 707 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.4755244755244754, + "grad_norm": 0.7985562682151794, + "kl": 0.30575287342071533, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 708 + }, + { + "completion_length": 212.5, + "epoch": 2.479020979020979, + "grad_norm": 1.0262845754623413, + "kl": 0.30596381425857544, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 709 + }, + { + "completion_length": 199.5, + "epoch": 2.4825174825174825, + "grad_norm": 0.8163771629333496, + "kl": 0.23148366808891296, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 710 + }, + { + "completion_length": 228.33334350585938, + "epoch": 2.486013986013986, + "grad_norm": 0.6531832218170166, + "kl": 0.27565860748291016, + "learning_rate": 4.069907644123346e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 711 + }, + { + "completion_length": 487.16668701171875, + "epoch": 2.4895104895104896, + "grad_norm": 0.3693908452987671, + "kl": 0.32342347502708435, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0129, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.493006993006993, + "grad_norm": 0.822213351726532, + "kl": 0.3490138649940491, + "learning_rate": 4.063106640839264e-06, + "loss": 0.014, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 713 + }, + { + "completion_length": 178.6666717529297, + "epoch": 2.4965034965034967, + "grad_norm": 0.7303230166435242, + "kl": 0.26454809308052063, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 714 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.5, + "grad_norm": 0.792052149772644, + "kl": 0.32973194122314453, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0132, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 715 + }, + { + "completion_length": 211.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.6441434025764465, + "kl": 0.3346059024333954, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0134, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 716 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.506993006993007, + "grad_norm": 2.2384145259857178, + "kl": 0.4402106702327728, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0176, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 717 + }, + { + "completion_length": 825.8333740234375, + "epoch": 2.5104895104895104, + "grad_norm": 0.4227934777736664, + "kl": 0.19202569127082825, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0077, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 718 + }, + { + "completion_length": 199.6666717529297, + "epoch": 2.513986013986014, + "grad_norm": 0.7948997020721436, + "kl": 0.30144181847572327, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0121, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 719 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.5174825174825175, + "grad_norm": 0.7969666123390198, + "kl": 0.2623240351676941, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0105, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 720 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.520979020979021, + "grad_norm": 1.1336637735366821, + "kl": 0.2935950756072998, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0117, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 721 + }, + { + "completion_length": 667.3333740234375, + "epoch": 2.5244755244755246, + "grad_norm": 0.39087414741516113, + "kl": 0.2444695681333542, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0098, + "reward": 3.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 722 + }, + { + "completion_length": 818.5, + "epoch": 2.527972027972028, + "grad_norm": 0.42902201414108276, + "kl": 0.18485748767852783, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0074, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 723 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.5314685314685317, + "grad_norm": 0.5554837584495544, + "kl": 0.3039252758026123, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 724 + }, + { + "completion_length": 519.3333740234375, + "epoch": 2.534965034965035, + "grad_norm": 0.44166073203086853, + "kl": 0.24431876838207245, + "learning_rate": 4.021903572521802e-06, + "loss": 0.0098, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 725 + }, + { + "completion_length": 200.0, + "epoch": 2.5384615384615383, + "grad_norm": 0.7037209868431091, + "kl": 0.3631229102611542, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0145, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 726 + }, + { + "completion_length": 192.5, + "epoch": 2.541958041958042, + "grad_norm": 0.664789617061615, + "kl": 0.29182663559913635, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0117, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 727 + }, + { + "completion_length": 198.5, + "epoch": 2.5454545454545454, + "grad_norm": 1.0678514242172241, + "kl": 0.28828293085098267, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 728 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.548951048951049, + "grad_norm": 0.8395413756370544, + "kl": 0.3076155185699463, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0123, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 729 + }, + { + "completion_length": 212.0, + "epoch": 2.5524475524475525, + "grad_norm": 0.7780301570892334, + "kl": 0.2876867651939392, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0115, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 730 + }, + { + "completion_length": 183.33334350585938, + "epoch": 2.555944055944056, + "grad_norm": 0.043716005980968475, + "kl": 0.40688663721084595, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0187, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 731 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.5594405594405596, + "grad_norm": 0.7270947098731995, + "kl": 0.2820360064506531, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 732 + }, + { + "completion_length": 522.0, + "epoch": 2.562937062937063, + "grad_norm": 0.5480185747146606, + "kl": 0.2843058109283447, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0114, + "reward": 3.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 733 + }, + { + "completion_length": 192.0, + "epoch": 2.5664335664335667, + "grad_norm": 0.733644962310791, + "kl": 0.27982231974601746, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 734 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.56993006993007, + "grad_norm": 0.7122451066970825, + "kl": 0.36668699979782104, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0147, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 735 + }, + { + "completion_length": 204.33334350585938, + "epoch": 2.5734265734265733, + "grad_norm": 0.07662484794855118, + "kl": 0.3632362484931946, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 736 + }, + { + "completion_length": 189.1666717529297, + "epoch": 2.5769230769230766, + "grad_norm": 0.34811052680015564, + "kl": 0.4749183654785156, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0214, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 737 + }, + { + "completion_length": 583.5, + "epoch": 2.5804195804195804, + "grad_norm": 0.3855575919151306, + "kl": 0.2539462447166443, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0102, + "reward": 2.704166889190674, + "reward_std": 0.6021661758422852, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 738 + }, + { + "completion_length": 174.6666717529297, + "epoch": 2.583916083916084, + "grad_norm": 1.0900449752807617, + "kl": 0.3619951605796814, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0145, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 739 + }, + { + "completion_length": 203.1666717529297, + "epoch": 2.5874125874125875, + "grad_norm": 0.9708390831947327, + "kl": 0.28454601764678955, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0114, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 740 + }, + { + "completion_length": 522.1666870117188, + "epoch": 2.590909090909091, + "grad_norm": 0.6295937895774841, + "kl": 0.26834964752197266, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0107, + "reward": 3.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 741 + }, + { + "completion_length": 703.5, + "epoch": 2.594405594405594, + "grad_norm": 0.35760697722435, + "kl": 0.28400832414627075, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0114, + "reward": 3.066666603088379, + "reward_std": 0.2857738435268402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 742 + }, + { + "completion_length": 833.8333740234375, + "epoch": 2.597902097902098, + "grad_norm": 0.5528135895729065, + "kl": 0.28165918588638306, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0113, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 743 + }, + { + "completion_length": 143.33334350585938, + "epoch": 2.6013986013986012, + "grad_norm": 0.7684369683265686, + "kl": 0.3106473684310913, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0124, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 744 + }, + { + "completion_length": 789.1666870117188, + "epoch": 2.604895104895105, + "grad_norm": 0.9867936372756958, + "kl": 0.2591046094894409, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 0.5224940180778503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 745 + }, + { + "completion_length": 194.83334350585938, + "epoch": 2.6083916083916083, + "grad_norm": 0.7808223962783813, + "kl": 0.30762046575546265, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 746 + }, + { + "completion_length": 503.8333435058594, + "epoch": 2.6118881118881117, + "grad_norm": 0.6441946625709534, + "kl": 0.2855534851551056, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0114, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 747 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6153846153846154, + "grad_norm": 0.8443914651870728, + "kl": 0.32207822799682617, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0129, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 748 + }, + { + "completion_length": 193.6666717529297, + "epoch": 2.6188811188811187, + "grad_norm": 0.620596706867218, + "kl": 0.33432909846305847, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0134, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 749 + }, + { + "completion_length": 218.33334350585938, + "epoch": 2.6223776223776225, + "grad_norm": 0.8599146604537964, + "kl": 0.2318965494632721, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0093, + "reward": 2.0375001430511475, + "reward_std": 0.4857339859008789, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 750 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.625874125874126, + "grad_norm": 0.042067479342222214, + "kl": 0.25582045316696167, + "learning_rate": 3.930364683613791e-06, + "loss": 0.0114, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 751 + }, + { + "completion_length": 187.6666717529297, + "epoch": 2.629370629370629, + "grad_norm": 0.6770573854446411, + "kl": 0.2656649649143219, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 752 + }, + { + "completion_length": 217.5, + "epoch": 2.632867132867133, + "grad_norm": 1.6130694150924683, + "kl": 0.29323238134384155, + "learning_rate": 3.923198808577111e-06, + "loss": 0.0117, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 753 + }, + { + "completion_length": 224.83334350585938, + "epoch": 2.6363636363636362, + "grad_norm": 0.7095122933387756, + "kl": 0.27353787422180176, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 754 + }, + { + "completion_length": 827.5, + "epoch": 2.63986013986014, + "grad_norm": 0.5739628076553345, + "kl": 0.21068716049194336, + "learning_rate": 3.916015592312083e-06, + "loss": 0.0084, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 755 + }, + { + "completion_length": 186.0, + "epoch": 2.6433566433566433, + "grad_norm": 0.8608355522155762, + "kl": 0.3407597243785858, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0136, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 756 + }, + { + "completion_length": 556.3333740234375, + "epoch": 2.6468531468531467, + "grad_norm": 0.3163861036300659, + "kl": 0.2427646368741989, + "learning_rate": 3.908815122343929e-06, + "loss": 0.0097, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 757 + }, + { + "completion_length": 187.5, + "epoch": 2.6503496503496504, + "grad_norm": 0.8031748533248901, + "kl": 0.30763155221939087, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0123, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 758 + }, + { + "completion_length": 218.1666717529297, + "epoch": 2.6538461538461537, + "grad_norm": 0.8372368216514587, + "kl": 0.28790879249572754, + "learning_rate": 3.901597486408105e-06, + "loss": 0.0115, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 759 + }, + { + "completion_length": 1181.8333740234375, + "epoch": 2.6573426573426575, + "grad_norm": 0.647392988204956, + "kl": 0.20365619659423828, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0081, + "reward": 1.7208335399627686, + "reward_std": 1.5208892822265625, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5541666746139526, + "step": 760 + }, + { + "completion_length": 180.1666717529297, + "epoch": 2.660839160839161, + "grad_norm": 0.6884165406227112, + "kl": 0.2719978392124176, + "learning_rate": 3.894362772449226e-06, + "loss": 0.0109, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 761 + }, + { + "completion_length": 257.8333435058594, + "epoch": 2.664335664335664, + "grad_norm": 1.337699055671692, + "kl": 0.5194430351257324, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0208, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 762 + }, + { + "completion_length": 498.16668701171875, + "epoch": 2.667832167832168, + "grad_norm": 0.9563208818435669, + "kl": 0.3499029874801636, + "learning_rate": 3.887111068619999e-06, + "loss": 0.014, + "reward": 1.75, + "reward_std": 1.1730302572250366, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 763 + }, + { + "completion_length": 215.33334350585938, + "epoch": 2.6713286713286712, + "grad_norm": 0.5849650502204895, + "kl": 0.21754197776317596, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0087, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 764 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.674825174825175, + "grad_norm": 0.5816351771354675, + "kl": 0.2685267925262451, + "learning_rate": 3.879842463280146e-06, + "loss": 0.0107, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 765 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6783216783216783, + "grad_norm": 0.7096436023712158, + "kl": 0.3302849531173706, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 766 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.6818181818181817, + "grad_norm": 0.7019976377487183, + "kl": 0.3386441469192505, + "learning_rate": 3.87255704499533e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 767 + }, + { + "completion_length": 220.83334350585938, + "epoch": 2.6853146853146854, + "grad_norm": 0.7764424681663513, + "kl": 0.25025084614753723, + "learning_rate": 3.868908058731376e-06, + "loss": 0.01, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 768 + }, + { + "completion_length": 191.1666717529297, + "epoch": 2.6888111888111887, + "grad_norm": 0.6796668767929077, + "kl": 0.2684442698955536, + "learning_rate": 3.865254902536073e-06, + "loss": 0.0107, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 769 + }, + { + "completion_length": 898.3333740234375, + "epoch": 2.6923076923076925, + "grad_norm": 0.38831865787506104, + "kl": 0.14873462915420532, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0059, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 770 + }, + { + "completion_length": 1279.166748046875, + "epoch": 2.695804195804196, + "grad_norm": 0.6360457539558411, + "kl": 0.1556037813425064, + "learning_rate": 3.857936124876677e-06, + "loss": 0.0062, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 771 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.699300699300699, + "grad_norm": 0.8891352415084839, + "kl": 0.2973707318305969, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0119, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 772 + }, + { + "completion_length": 223.0, + "epoch": 2.702797202797203, + "grad_norm": 0.9200516939163208, + "kl": 0.2344827651977539, + "learning_rate": 3.850600801194138e-06, + "loss": 0.0094, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 773 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.7062937062937062, + "grad_norm": 1.2495554685592651, + "kl": 0.4023559093475342, + "learning_rate": 3.846926962517158e-06, + "loss": 0.0161, + "reward": 2.4000000953674316, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 774 + }, + { + "completion_length": 186.83334350585938, + "epoch": 2.70979020979021, + "grad_norm": 0.7409746646881104, + "kl": 0.2839186489582062, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 775 + }, + { + "completion_length": 187.1666717529297, + "epoch": 2.7132867132867133, + "grad_norm": 0.9320999383926392, + "kl": 0.2990000247955322, + "learning_rate": 3.839566987447492e-06, + "loss": 0.012, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 776 + }, + { + "completion_length": 531.8333740234375, + "epoch": 2.7167832167832167, + "grad_norm": 0.3263534903526306, + "kl": 0.2381911277770996, + "learning_rate": 3.835880873474567e-06, + "loss": 0.0095, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 777 + }, + { + "completion_length": 1248.666748046875, + "epoch": 2.7202797202797204, + "grad_norm": 0.5097912549972534, + "kl": 0.1756594479084015, + "learning_rate": 3.832190690176825e-06, + "loss": 0.007, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 778 + }, + { + "completion_length": 546.5, + "epoch": 2.7237762237762237, + "grad_norm": 0.38489583134651184, + "kl": 0.233808696269989, + "learning_rate": 3.828496448795208e-06, + "loss": 0.0094, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 779 + }, + { + "completion_length": 200.5, + "epoch": 2.7272727272727275, + "grad_norm": 0.6196880340576172, + "kl": 0.28656402230262756, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 780 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.730769230769231, + "grad_norm": 0.06716328859329224, + "kl": 0.35444962978363037, + "learning_rate": 3.821095836805868e-06, + "loss": 0.0166, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 781 + }, + { + "completion_length": 1108.0, + "epoch": 2.734265734265734, + "grad_norm": 0.46010759472846985, + "kl": 0.2134471833705902, + "learning_rate": 3.817389488741694e-06, + "loss": 0.0085, + "reward": 2.9375, + "reward_std": 1.2437593936920166, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7708332538604736, + "step": 782 + }, + { + "completion_length": 192.83334350585938, + "epoch": 2.737762237762238, + "grad_norm": 0.7892248034477234, + "kl": 0.27930283546447754, + "learning_rate": 3.8136791276806695e-06, + "loss": 0.0112, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 783 + }, + { + "completion_length": 526.1666870117188, + "epoch": 2.7412587412587412, + "grad_norm": 0.5663818120956421, + "kl": 0.23246847093105316, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.0093, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 784 + }, + { + "completion_length": 302.0, + "epoch": 2.744755244755245, + "grad_norm": 0.8390914797782898, + "kl": 0.304746150970459, + "learning_rate": 3.806246411789872e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 785 + }, + { + "completion_length": 463.66668701171875, + "epoch": 2.7482517482517483, + "grad_norm": 0.4586171507835388, + "kl": 0.2490534633398056, + "learning_rate": 3.802524079601442e-06, + "loss": 0.01, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 786 + }, + { + "completion_length": 538.1666870117188, + "epoch": 2.7517482517482517, + "grad_norm": 0.4255636930465698, + "kl": 0.21123819053173065, + "learning_rate": 3.798797779698774e-06, + "loss": 0.0084, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 787 + }, + { + "completion_length": 499.0, + "epoch": 2.755244755244755, + "grad_norm": 0.5292470455169678, + "kl": 0.24850648641586304, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0099, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 788 + }, + { + "completion_length": 527.6666870117188, + "epoch": 2.7587412587412588, + "grad_norm": 0.6640042662620544, + "kl": 0.21320059895515442, + "learning_rate": 3.791333322166605e-06, + "loss": 0.0085, + "reward": 2.3500001430511475, + "reward_std": 0.6928203105926514, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 789 + }, + { + "completion_length": 212.5, + "epoch": 2.762237762237762, + "grad_norm": 0.7885140776634216, + "kl": 0.25268059968948364, + "learning_rate": 3.787595187275136e-06, + "loss": 0.0101, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 790 + }, + { + "completion_length": 185.0, + "epoch": 2.765734265734266, + "grad_norm": 1.2679868936538696, + "kl": 0.35767948627471924, + "learning_rate": 3.7838531301454257e-06, + "loss": 0.0143, + "reward": 2.3500001430511475, + "reward_std": 0.9380831718444824, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 791 + }, + { + "completion_length": 202.0, + "epoch": 2.769230769230769, + "grad_norm": 0.6652596592903137, + "kl": 0.2619841694831848, + "learning_rate": 3.780107162176429e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 792 + }, + { + "completion_length": 474.0, + "epoch": 2.7727272727272725, + "grad_norm": 8.084759712219238, + "kl": 2.9472758769989014, + "learning_rate": 3.776357294779015e-06, + "loss": 0.1179, + "reward": 2.133333444595337, + "reward_std": 1.6972527503967285, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 793 + }, + { + "completion_length": 487.3333435058594, + "epoch": 2.7762237762237763, + "grad_norm": 0.43876194953918457, + "kl": 0.234140545129776, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0094, + "reward": 2.875, + "reward_std": 1.2624380588531494, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 794 + }, + { + "completion_length": 216.5, + "epoch": 2.7797202797202796, + "grad_norm": 0.7178113460540771, + "kl": 0.3248441517353058, + "learning_rate": 3.768845907401761e-06, + "loss": 0.013, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 795 + }, + { + "completion_length": 425.66668701171875, + "epoch": 2.7832167832167833, + "grad_norm": 0.4357425570487976, + "kl": 0.24865001440048218, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0099, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 796 + }, + { + "completion_length": 827.1666870117188, + "epoch": 2.7867132867132867, + "grad_norm": 0.36945709586143494, + "kl": 0.26294025778770447, + "learning_rate": 3.7613190595375484e-06, + "loss": 0.0105, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 797 + }, + { + "completion_length": 193.33334350585938, + "epoch": 2.79020979020979, + "grad_norm": 1.0428582429885864, + "kl": 0.3159600496292114, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0126, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 798 + }, + { + "completion_length": 573.1666870117188, + "epoch": 2.7937062937062938, + "grad_norm": 0.7567842602729797, + "kl": 0.37068232893943787, + "learning_rate": 3.753776842898644e-06, + "loss": 0.0148, + "reward": 2.704166889190674, + "reward_std": 0.8732721209526062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 799 + }, + { + "completion_length": 500.66668701171875, + "epoch": 2.797202797202797, + "grad_norm": 0.5451098680496216, + "kl": 0.24930475652217865, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.01, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 800 + }, + { + "completion_length": 534.8333740234375, + "epoch": 2.800699300699301, + "grad_norm": 0.6100650429725647, + "kl": 0.2307787835597992, + "learning_rate": 3.7462193493845763e-06, + "loss": 0.0092, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 801 + }, + { + "completion_length": 494.3333435058594, + "epoch": 2.804195804195804, + "grad_norm": 0.3402723968029022, + "kl": 0.18503499031066895, + "learning_rate": 3.742434902568889e-06, + "loss": 0.0074, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 802 + }, + { + "completion_length": 173.5, + "epoch": 2.8076923076923075, + "grad_norm": 1.8058785200119019, + "kl": 0.3168944716453552, + "learning_rate": 3.738646671081019e-06, + "loss": 0.0127, + "reward": 2.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 803 + }, + { + "completion_length": 512.8333740234375, + "epoch": 2.8111888111888113, + "grad_norm": 0.526727020740509, + "kl": 0.23828241229057312, + "learning_rate": 3.7348546664605777e-06, + "loss": 0.0095, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 804 + }, + { + "completion_length": 527.5, + "epoch": 2.8146853146853146, + "grad_norm": 0.41340726613998413, + "kl": 0.18879906833171844, + "learning_rate": 3.7310589002586683e-06, + "loss": 0.0076, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 805 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.8181818181818183, + "grad_norm": 0.5648691058158875, + "kl": 0.315143346786499, + "learning_rate": 3.7272593840378526e-06, + "loss": 0.0126, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 806 + }, + { + "completion_length": 478.0, + "epoch": 2.8216783216783217, + "grad_norm": 0.7256986498832703, + "kl": 0.36903661489486694, + "learning_rate": 3.723456129372116e-06, + "loss": 0.0148, + "reward": 2.691666603088379, + "reward_std": 1.66505765914917, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 807 + }, + { + "completion_length": 502.3333435058594, + "epoch": 2.825174825174825, + "grad_norm": 0.44219493865966797, + "kl": 0.20148871839046478, + "learning_rate": 3.7196491478468322e-06, + "loss": 0.0081, + "reward": 3.125, + "reward_std": 1.5823242664337158, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 808 + }, + { + "completion_length": 486.5, + "epoch": 2.8286713286713288, + "grad_norm": 0.47201624512672424, + "kl": 0.21870753169059753, + "learning_rate": 3.7158384510587264e-06, + "loss": 0.0087, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 809 + }, + { + "completion_length": 537.3333740234375, + "epoch": 2.832167832167832, + "grad_norm": 0.6436510682106018, + "kl": 0.22534185647964478, + "learning_rate": 3.7120240506158433e-06, + "loss": 0.009, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 810 + }, + { + "completion_length": 237.5, + "epoch": 2.835664335664336, + "grad_norm": 0.646536648273468, + "kl": 0.27237847447395325, + "learning_rate": 3.708205958137506e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 811 + }, + { + "completion_length": 255.6666717529297, + "epoch": 2.839160839160839, + "grad_norm": 0.6682825684547424, + "kl": 0.3421524465084076, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.0137, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 812 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.8426573426573425, + "grad_norm": 0.9165672063827515, + "kl": 0.307815283536911, + "learning_rate": 3.7005587436079724e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 813 + }, + { + "completion_length": 830.8333740234375, + "epoch": 2.8461538461538463, + "grad_norm": 0.4606754779815674, + "kl": 0.20725002884864807, + "learning_rate": 3.6967296448515176e-06, + "loss": 0.0083, + "reward": 1.633333444595337, + "reward_std": 1.3418892621994019, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 814 + }, + { + "completion_length": 165.6666717529297, + "epoch": 2.8496503496503496, + "grad_norm": 8.907544136047363, + "kl": 0.7807542085647583, + "learning_rate": 3.6928969006490212e-06, + "loss": 0.0312, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 815 + }, + { + "completion_length": 852.3333740234375, + "epoch": 2.8531468531468533, + "grad_norm": 0.5166431665420532, + "kl": 0.22571682929992676, + "learning_rate": 3.689060522675689e-06, + "loss": 0.009, + "reward": 2.191666603088379, + "reward_std": 1.2499668598175049, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 816 + }, + { + "completion_length": 223.6666717529297, + "epoch": 2.8566433566433567, + "grad_norm": 0.7508683204650879, + "kl": 0.2404782623052597, + "learning_rate": 3.6852205226177907e-06, + "loss": 0.0096, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 817 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.86013986013986, + "grad_norm": 0.9129036664962769, + "kl": 0.3411031663417816, + "learning_rate": 3.6813769121726356e-06, + "loss": 0.0136, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 818 + }, + { + "completion_length": 516.5, + "epoch": 2.8636363636363638, + "grad_norm": 0.462089866399765, + "kl": 0.2940046191215515, + "learning_rate": 3.677529703048525e-06, + "loss": 0.0118, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 819 + }, + { + "completion_length": 200.33334350585938, + "epoch": 2.867132867132867, + "grad_norm": 0.8957485556602478, + "kl": 0.2655426859855652, + "learning_rate": 3.6736789069647273e-06, + "loss": 0.0106, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 820 + }, + { + "completion_length": 198.5, + "epoch": 2.870629370629371, + "grad_norm": 0.8342744708061218, + "kl": 0.24618063867092133, + "learning_rate": 3.6698245356514337e-06, + "loss": 0.0098, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 821 + }, + { + "completion_length": 153.33334350585938, + "epoch": 2.874125874125874, + "grad_norm": 0.7607580423355103, + "kl": 0.27888309955596924, + "learning_rate": 3.6659666008497287e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 822 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.8776223776223775, + "grad_norm": 0.8246819972991943, + "kl": 0.2606455087661743, + "learning_rate": 3.66210511431155e-06, + "loss": 0.0104, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 823 + }, + { + "completion_length": 509.5, + "epoch": 2.8811188811188813, + "grad_norm": 0.4599984586238861, + "kl": 0.2845722734928131, + "learning_rate": 3.658240087799655e-06, + "loss": 0.0114, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 824 + }, + { + "completion_length": 577.5, + "epoch": 2.8846153846153846, + "grad_norm": 0.3288043439388275, + "kl": 0.2456766664981842, + "learning_rate": 3.654371533087586e-06, + "loss": 0.0098, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 825 + }, + { + "completion_length": 516.3333740234375, + "epoch": 2.8881118881118883, + "grad_norm": 0.4140225946903229, + "kl": 0.2799639403820038, + "learning_rate": 3.6504994619596295e-06, + "loss": 0.0112, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 826 + }, + { + "completion_length": 180.0, + "epoch": 2.8916083916083917, + "grad_norm": 4.02695894241333, + "kl": 0.5962376594543457, + "learning_rate": 3.6466238862107884e-06, + "loss": 0.0238, + "reward": 2.625, + "reward_std": 1.1973929405212402, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 827 + }, + { + "completion_length": 220.5, + "epoch": 2.895104895104895, + "grad_norm": 0.7351843118667603, + "kl": 0.2822580337524414, + "learning_rate": 3.642744817646736e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 828 + }, + { + "completion_length": 515.0, + "epoch": 2.8986013986013988, + "grad_norm": 0.08226211369037628, + "kl": 0.25046059489250183, + "learning_rate": 3.6388622680837893e-06, + "loss": 0.0124, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 829 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.902097902097902, + "grad_norm": 0.4543350040912628, + "kl": 0.24183645844459534, + "learning_rate": 3.634976249348867e-06, + "loss": 0.0097, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 830 + }, + { + "completion_length": 218.6666717529297, + "epoch": 2.905594405594406, + "grad_norm": 0.7515471577644348, + "kl": 0.23829472064971924, + "learning_rate": 3.631086773279457e-06, + "loss": 0.0095, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 831 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.909090909090909, + "grad_norm": 0.7354035973548889, + "kl": 0.261251837015152, + "learning_rate": 3.627193851723577e-06, + "loss": 0.0105, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 832 + }, + { + "completion_length": 212.6666717529297, + "epoch": 2.9125874125874125, + "grad_norm": 0.057771261781454086, + "kl": 0.27106887102127075, + "learning_rate": 3.6232974965397416e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 833 + }, + { + "completion_length": 504.5, + "epoch": 2.916083916083916, + "grad_norm": 0.5733168125152588, + "kl": 0.2574426233768463, + "learning_rate": 3.6193977195969243e-06, + "loss": 0.0103, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 834 + }, + { + "completion_length": 201.33334350585938, + "epoch": 2.9195804195804196, + "grad_norm": 0.04076343774795532, + "kl": 0.3926897644996643, + "learning_rate": 3.6154945327745223e-06, + "loss": 0.0181, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 835 + }, + { + "completion_length": 504.66668701171875, + "epoch": 2.9230769230769234, + "grad_norm": 0.41645586490631104, + "kl": 0.28634482622146606, + "learning_rate": 3.611587947962319e-06, + "loss": 0.0115, + "reward": 2.8500001430511475, + "reward_std": 0.8366600275039673, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 836 + }, + { + "completion_length": 193.1666717529297, + "epoch": 2.9265734265734267, + "grad_norm": 0.584320068359375, + "kl": 0.36589449644088745, + "learning_rate": 3.6076779770604496e-06, + "loss": 0.0146, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 837 + }, + { + "completion_length": 151.1666717529297, + "epoch": 2.93006993006993, + "grad_norm": 20.536643981933594, + "kl": 2.479689121246338, + "learning_rate": 3.6037646319793635e-06, + "loss": 0.0992, + "reward": 2.2166666984558105, + "reward_std": 0.8577102422714233, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 838 + }, + { + "completion_length": 191.5, + "epoch": 2.9335664335664333, + "grad_norm": 16.66517448425293, + "kl": 1.5780773162841797, + "learning_rate": 3.599847924639788e-06, + "loss": 0.0631, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 839 + }, + { + "completion_length": 391.16668701171875, + "epoch": 2.937062937062937, + "grad_norm": 0.408719927072525, + "kl": 0.27218514680862427, + "learning_rate": 3.595927866972694e-06, + "loss": 0.0109, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 840 + }, + { + "completion_length": 214.83334350585938, + "epoch": 2.9405594405594404, + "grad_norm": 0.8976386189460754, + "kl": 0.3075045049190521, + "learning_rate": 3.592004470919256e-06, + "loss": 0.0123, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 841 + }, + { + "completion_length": 191.5, + "epoch": 2.944055944055944, + "grad_norm": 0.8545355796813965, + "kl": 0.3112524747848511, + "learning_rate": 3.5880777484308193e-06, + "loss": 0.0125, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 842 + }, + { + "completion_length": 169.0, + "epoch": 2.9475524475524475, + "grad_norm": 3.7284159660339355, + "kl": 0.7016023397445679, + "learning_rate": 3.5841477114688616e-06, + "loss": 0.0281, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 843 + }, + { + "completion_length": 125.16667175292969, + "epoch": 2.951048951048951, + "grad_norm": 0.08267883211374283, + "kl": 0.3959384560585022, + "learning_rate": 3.5802143720049565e-06, + "loss": 0.0182, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 844 + }, + { + "completion_length": 191.5, + "epoch": 2.9545454545454546, + "grad_norm": 0.6881157159805298, + "kl": 0.28095734119415283, + "learning_rate": 3.5762777420207382e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 845 + }, + { + "completion_length": 165.0, + "epoch": 2.958041958041958, + "grad_norm": 17.351783752441406, + "kl": 3.690758466720581, + "learning_rate": 3.5723378335078653e-06, + "loss": 0.1476, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 846 + }, + { + "completion_length": 498.8333435058594, + "epoch": 2.9615384615384617, + "grad_norm": 0.6604005098342896, + "kl": 0.22229203581809998, + "learning_rate": 3.5683946584679818e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355835437774658, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 847 + }, + { + "completion_length": 831.5, + "epoch": 2.965034965034965, + "grad_norm": 0.4960877597332001, + "kl": 0.2161356508731842, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0086, + "reward": 1.691666603088379, + "reward_std": 1.2619099617004395, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 848 + }, + { + "completion_length": 206.0, + "epoch": 2.9685314685314683, + "grad_norm": 0.7653523683547974, + "kl": 0.2660280764102936, + "learning_rate": 3.5604985568634754e-06, + "loss": 0.0106, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 849 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.972027972027972, + "grad_norm": 5.382836818695068, + "kl": 0.5285301804542542, + "learning_rate": 3.556545654351749e-06, + "loss": 0.0211, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 850 + }, + { + "completion_length": 240.0, + "epoch": 2.9755244755244754, + "grad_norm": 0.7501710653305054, + "kl": 0.2436714470386505, + "learning_rate": 3.552589533418728e-06, + "loss": 0.0097, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 851 + }, + { + "completion_length": 243.6666717529297, + "epoch": 2.979020979020979, + "grad_norm": 0.6721853017807007, + "kl": 0.2798357903957367, + "learning_rate": 3.5486302061154433e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 852 + }, + { + "completion_length": 444.5, + "epoch": 2.9825174825174825, + "grad_norm": 0.47981539368629456, + "kl": 0.2967255413532257, + "learning_rate": 3.5446676845026922e-06, + "loss": 0.0119, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 853 + }, + { + "completion_length": 395.66668701171875, + "epoch": 2.986013986013986, + "grad_norm": 0.5626189708709717, + "kl": 0.27203381061553955, + "learning_rate": 3.5407019806510035e-06, + "loss": 0.0109, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 854 + }, + { + "completion_length": 170.33334350585938, + "epoch": 2.9895104895104896, + "grad_norm": 0.6966221928596497, + "kl": 0.28407615423202515, + "learning_rate": 3.536733106640598e-06, + "loss": 0.0114, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 855 + }, + { + "completion_length": 238.0, + "epoch": 2.993006993006993, + "grad_norm": 0.7480601668357849, + "kl": 0.2773779332637787, + "learning_rate": 3.532761074561355e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 856 + }, + { + "completion_length": 202.5, + "epoch": 2.9965034965034967, + "grad_norm": 0.7340975999832153, + "kl": 0.3754419982433319, + "learning_rate": 3.5287858965127726e-06, + "loss": 0.015, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 857 + }, + { + "completion_length": 197.5, + "epoch": 3.0, + "grad_norm": 0.7914144992828369, + "kl": 0.2655482590198517, + "learning_rate": 3.524807584603932e-06, + "loss": 0.0106, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 858 + }, + { + "completion_length": 200.83334350585938, + "epoch": 3.0034965034965033, + "grad_norm": 0.7124848365783691, + "kl": 0.2514963448047638, + "learning_rate": 3.5208261509534627e-06, + "loss": 0.0101, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 859 + }, + { + "completion_length": 238.83334350585938, + "epoch": 3.006993006993007, + "grad_norm": 0.061087291687726974, + "kl": 0.3129764497280121, + "learning_rate": 3.516841607689501e-06, + "loss": 0.0149, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 860 + }, + { + "completion_length": 237.5, + "epoch": 3.0104895104895104, + "grad_norm": 0.7671120762825012, + "kl": 0.2705675959587097, + "learning_rate": 3.512853966949657e-06, + "loss": 0.0108, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 861 + }, + { + "completion_length": 220.0, + "epoch": 3.013986013986014, + "grad_norm": 0.7371062636375427, + "kl": 0.2749347686767578, + "learning_rate": 3.5088632408809757e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 862 + }, + { + "completion_length": 291.3333435058594, + "epoch": 3.0174825174825175, + "grad_norm": 0.43244338035583496, + "kl": 0.2551594078540802, + "learning_rate": 3.504869441639901e-06, + "loss": 0.0102, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 863 + }, + { + "completion_length": 478.5, + "epoch": 3.020979020979021, + "grad_norm": 0.6255612373352051, + "kl": 0.21007221937179565, + "learning_rate": 3.5008725813922383e-06, + "loss": 0.0084, + "reward": 2.7333333492279053, + "reward_std": 0.8256310820579529, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 864 + }, + { + "completion_length": 524.3333740234375, + "epoch": 3.0244755244755246, + "grad_norm": 3.062211751937866, + "kl": 0.5263761878013611, + "learning_rate": 3.496872672313116e-06, + "loss": 0.0211, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 865 + }, + { + "completion_length": 234.33334350585938, + "epoch": 3.027972027972028, + "grad_norm": 0.7551546692848206, + "kl": 0.28137749433517456, + "learning_rate": 3.4928697265869516e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 866 + }, + { + "completion_length": 784.1666870117188, + "epoch": 3.0314685314685317, + "grad_norm": 1202.5098876953125, + "kl": 133.5108184814453, + "learning_rate": 3.488863756407413e-06, + "loss": 5.3404, + "reward": 1.7000001668930054, + "reward_std": 1.4064139127731323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 867 + }, + { + "completion_length": 495.5, + "epoch": 3.034965034965035, + "grad_norm": 0.5171676278114319, + "kl": 0.2767552137374878, + "learning_rate": 3.4848547739773782e-06, + "loss": 0.0111, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 868 + }, + { + "completion_length": 237.33334350585938, + "epoch": 3.0384615384615383, + "grad_norm": 0.6683189272880554, + "kl": 0.26167619228363037, + "learning_rate": 3.480842791508904e-06, + "loss": 0.0105, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 869 + }, + { + "completion_length": 234.33334350585938, + "epoch": 3.041958041958042, + "grad_norm": 0.7994163036346436, + "kl": 0.24035631120204926, + "learning_rate": 3.476827821223184e-06, + "loss": 0.0096, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 870 + }, + { + "completion_length": 249.83334350585938, + "epoch": 3.0454545454545454, + "grad_norm": 0.06245582178235054, + "kl": 0.35248756408691406, + "learning_rate": 3.4728098753505157e-06, + "loss": 0.0165, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 871 + }, + { + "completion_length": 214.33334350585938, + "epoch": 3.0489510489510487, + "grad_norm": 0.7337875366210938, + "kl": 0.2762923240661621, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.0111, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 872 + }, + { + "completion_length": 178.6666717529297, + "epoch": 3.0524475524475525, + "grad_norm": 5.453631401062012, + "kl": 0.6660811305046082, + "learning_rate": 3.4647651058107967e-06, + "loss": 0.0266, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 873 + }, + { + "completion_length": 207.0, + "epoch": 3.055944055944056, + "grad_norm": 0.7989277839660645, + "kl": 0.2972930669784546, + "learning_rate": 3.460738306649509e-06, + "loss": 0.0119, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 874 + }, + { + "completion_length": 547.6666870117188, + "epoch": 3.0594405594405596, + "grad_norm": 0.3988294005393982, + "kl": 0.19996020197868347, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.008, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 875 + }, + { + "completion_length": 227.0, + "epoch": 3.062937062937063, + "grad_norm": 0.049581676721572876, + "kl": 0.3112030327320099, + "learning_rate": 3.452675940875686e-06, + "loss": 0.0148, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 876 + }, + { + "completion_length": 218.83334350585938, + "epoch": 3.0664335664335662, + "grad_norm": 0.10846934467554092, + "kl": 0.300067275762558, + "learning_rate": 3.448640398822513e-06, + "loss": 0.0144, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 877 + }, + { + "completion_length": 808.8333740234375, + "epoch": 3.06993006993007, + "grad_norm": 0.5524038672447205, + "kl": 0.21805749833583832, + "learning_rate": 3.4446019670461684e-06, + "loss": 0.0087, + "reward": 2.516666889190674, + "reward_std": 1.2355835437774658, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 878 + }, + { + "completion_length": 227.6666717529297, + "epoch": 3.0734265734265733, + "grad_norm": 0.8796508312225342, + "kl": 0.3852163255214691, + "learning_rate": 3.440560657848414e-06, + "loss": 0.0154, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 879 + }, + { + "completion_length": 493.8333435058594, + "epoch": 3.076923076923077, + "grad_norm": 0.7607131004333496, + "kl": 0.37157270312309265, + "learning_rate": 3.436516483539781e-06, + "loss": 0.0149, + "reward": 2.5250000953674316, + "reward_std": 1.6839685440063477, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 880 + }, + { + "completion_length": 322.3333435058594, + "epoch": 3.0804195804195804, + "grad_norm": 0.8827399611473083, + "kl": 0.23807722330093384, + "learning_rate": 3.4324694564395228e-06, + "loss": 0.0095, + "reward": 3.258333206176758, + "reward_std": 0.7939878702163696, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9250000715255737, + "step": 881 + }, + { + "completion_length": 508.66668701171875, + "epoch": 3.0839160839160837, + "grad_norm": 0.619054913520813, + "kl": 0.2534877061843872, + "learning_rate": 3.4284195888755877e-06, + "loss": 0.0101, + "reward": 2.3500001430511475, + "reward_std": 1.2328827381134033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 882 + }, + { + "completion_length": 525.0, + "epoch": 3.0874125874125875, + "grad_norm": 0.340080589056015, + "kl": 0.33849403262138367, + "learning_rate": 3.4243668931845734e-06, + "loss": 0.0135, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 883 + }, + { + "completion_length": 199.33334350585938, + "epoch": 3.090909090909091, + "grad_norm": 0.8693441152572632, + "kl": 0.29566070437431335, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.0118, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 884 + }, + { + "completion_length": 528.6666870117188, + "epoch": 3.0944055944055946, + "grad_norm": 0.4151526689529419, + "kl": 0.22466908395290375, + "learning_rate": 3.4162530668107435e-06, + "loss": 0.009, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 885 + }, + { + "completion_length": 166.6666717529297, + "epoch": 3.097902097902098, + "grad_norm": 0.7520537972450256, + "kl": 0.2834951877593994, + "learning_rate": 3.412191960844049e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 886 + }, + { + "completion_length": 1012.3333740234375, + "epoch": 3.1013986013986012, + "grad_norm": 0.3472742438316345, + "kl": 0.23108121752738953, + "learning_rate": 3.4081280761824465e-06, + "loss": 0.0092, + "reward": 3.183333396911621, + "reward_std": 1.4445298910140991, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 887 + }, + { + "completion_length": 542.0, + "epoch": 3.104895104895105, + "grad_norm": 0.3802301287651062, + "kl": 0.27087870240211487, + "learning_rate": 3.4040614252052305e-06, + "loss": 0.0108, + "reward": 3.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 888 + }, + { + "completion_length": 239.5, + "epoch": 3.1083916083916083, + "grad_norm": 0.7231019735336304, + "kl": 0.27221542596817017, + "learning_rate": 3.3999920203001287e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 889 + }, + { + "completion_length": 129.83334350585938, + "epoch": 3.111888111888112, + "grad_norm": 0.6994293332099915, + "kl": 0.5249607563018799, + "learning_rate": 3.39591987386325e-06, + "loss": 0.0234, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 890 + }, + { + "completion_length": 1352.3333740234375, + "epoch": 3.1153846153846154, + "grad_norm": 0.4665977954864502, + "kl": 0.17533139884471893, + "learning_rate": 3.391844998299063e-06, + "loss": 0.007, + "reward": 2.612499952316284, + "reward_std": 0.989412784576416, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7791666984558105, + "step": 891 + }, + { + "completion_length": 515.8333740234375, + "epoch": 3.1188811188811187, + "grad_norm": 0.7257252335548401, + "kl": 0.2537638545036316, + "learning_rate": 3.387767406020343e-06, + "loss": 0.0102, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 892 + }, + { + "completion_length": 520.1666870117188, + "epoch": 3.1223776223776225, + "grad_norm": 0.46513354778289795, + "kl": 0.2608945369720459, + "learning_rate": 3.383687109448143e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 893 + }, + { + "completion_length": 244.0, + "epoch": 3.125874125874126, + "grad_norm": 0.08787418156862259, + "kl": 0.3290454149246216, + "learning_rate": 3.3796041210117545e-06, + "loss": 0.0155, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 894 + }, + { + "completion_length": 255.1666717529297, + "epoch": 3.129370629370629, + "grad_norm": 0.693707287311554, + "kl": 0.2893093526363373, + "learning_rate": 3.375518453148669e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 895 + }, + { + "completion_length": 339.0, + "epoch": 3.132867132867133, + "grad_norm": 0.6376519203186035, + "kl": 0.25471729040145874, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.0102, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 896 + }, + { + "completion_length": 199.1666717529297, + "epoch": 3.1363636363636362, + "grad_norm": 0.8792543411254883, + "kl": 0.2878587245941162, + "learning_rate": 3.3673391289331398e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 897 + }, + { + "completion_length": 231.6666717529297, + "epoch": 3.13986013986014, + "grad_norm": 0.865860641002655, + "kl": 0.2975502610206604, + "learning_rate": 3.3632454974963368e-06, + "loss": 0.0119, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 898 + }, + { + "completion_length": 202.33334350585938, + "epoch": 3.1433566433566433, + "grad_norm": 0.7683760523796082, + "kl": 0.2757861614227295, + "learning_rate": 3.359149236464041e-06, + "loss": 0.011, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 899 + }, + { + "completion_length": 532.8333740234375, + "epoch": 3.1468531468531467, + "grad_norm": 0.4694902002811432, + "kl": 0.2863316535949707, + "learning_rate": 3.3550503583141726e-06, + "loss": 0.0115, + "reward": 3.183333396911621, + "reward_std": 1.4445298910140991, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 900 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/checkpoint-950/README.md b/checkpoint-950/README.md new file mode 100644 index 0000000000000000000000000000000000000000..342a23987f57b711334f1f7c4b72004ab4751d11 --- /dev/null +++ b/checkpoint-950/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-950/adapter_config.json b/checkpoint-950/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69f90ffbea02ffd530ac27f43588458c02af39 --- /dev/null +++ b/checkpoint-950/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-950/adapter_model.safetensors b/checkpoint-950/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f4bf3d8d4e6307a379ab0c1c682bdebdee0d140 --- /dev/null +++ b/checkpoint-950/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:220ec641785adb6318bd34143bae6423f662f81604dfe36b12fd3fe14a9e6803 +size 778096664 diff --git a/checkpoint-950/optimizer.pt b/checkpoint-950/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b15956a7b7da31dc178d99c95bfcb6e46e800597 --- /dev/null +++ b/checkpoint-950/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e69ce9141830d53bb60f8e7995187493e407923e185a0d75ae6866a7bc2530 +size 395571252 diff --git a/checkpoint-950/rng_state.pth b/checkpoint-950/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..42bb4f35620cfd713ec31cb2a6201271dd44ccb6 --- /dev/null +++ b/checkpoint-950/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a6b2fd14f85c321a8d0e62f70b31150ff6dd4b7e9b3029068e57fe23e78c9d +size 14244 diff --git a/checkpoint-950/scheduler.pt b/checkpoint-950/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..68df16378e3794107e0bd87b05875597364cae63 --- /dev/null +++ b/checkpoint-950/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08553ea5fa2bf20c5e49f94ece610547c63cf8cc543858c43b1ffb0ff2376298 +size 1064 diff --git a/checkpoint-950/special_tokens_map.json b/checkpoint-950/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-950/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-950/tokenizer.json b/checkpoint-950/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-950/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-950/tokenizer_config.json b/checkpoint-950/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f29bafcf7d24e386a389486e71a4e81dfef0f5c2 --- /dev/null +++ b/checkpoint-950/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-950/trainer_state.json b/checkpoint-950/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8dfeeded47921f39e30e1eb7e4f132fc3774f4df --- /dev/null +++ b/checkpoint-950/trainer_state.json @@ -0,0 +1,14283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.3216783216783217, + "eval_steps": 500, + "global_step": 950, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 399.0, + "epoch": 0.0034965034965034965, + "grad_norm": 0.9857833385467529, + "kl": 0.0, + "learning_rate": 2.5000000000000002e-08, + "loss": 0.0, + "reward": 1.75, + "reward_std": 1.069111704826355, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4166666865348816, + "step": 1 + }, + { + "completion_length": 305.3333435058594, + "epoch": 0.006993006993006993, + "grad_norm": 1.3122953176498413, + "kl": 0.0, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.0, + "reward": 1.0500000715255737, + "reward_std": 0.6340347528457642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 2 + }, + { + "completion_length": 475.3333435058594, + "epoch": 0.01048951048951049, + "grad_norm": 6.344944953918457, + "kl": 0.0006356238736771047, + "learning_rate": 7.500000000000001e-08, + "loss": 0.0, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 3 + }, + { + "completion_length": 378.3333435058594, + "epoch": 0.013986013986013986, + "grad_norm": 0.9831988215446472, + "kl": 0.0006719424272887409, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "reward": 1.2208333015441895, + "reward_std": 1.3383214473724365, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.22083334624767303, + "step": 4 + }, + { + "completion_length": 925.0, + "epoch": 0.017482517482517484, + "grad_norm": 1.042701005935669, + "kl": 0.000699286290910095, + "learning_rate": 1.2500000000000002e-07, + "loss": 0.0, + "reward": 2.4666666984558105, + "reward_std": 1.618847370147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 5 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.02097902097902098, + "grad_norm": 1.276957631111145, + "kl": 0.0007741473382338881, + "learning_rate": 1.5000000000000002e-07, + "loss": 0.0, + "reward": 0.38333332538604736, + "reward_std": 0.7222649455070496, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 6 + }, + { + "completion_length": 185.5, + "epoch": 0.024475524475524476, + "grad_norm": 1.277024507522583, + "kl": 0.0007853443967178464, + "learning_rate": 1.7500000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.44017040729522705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 7 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.027972027972027972, + "grad_norm": 4.894377708435059, + "kl": 0.0010196010116487741, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": 0.7250000238418579, + "reward_std": 0.5777109861373901, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 8 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03146853146853147, + "grad_norm": 0.9491543769836426, + "kl": 0.0009398699621669948, + "learning_rate": 2.2500000000000002e-07, + "loss": 0.0, + "reward": 1.2750000953674316, + "reward_std": 0.673609733581543, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 9 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.03496503496503497, + "grad_norm": 4.634313583374023, + "kl": 0.0008446139981970191, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0, + "reward": 0.5791666507720947, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 10 + }, + { + "completion_length": 181.0, + "epoch": 0.038461538461538464, + "grad_norm": 0.9203607439994812, + "kl": 0.0005472182529047132, + "learning_rate": 2.75e-07, + "loss": 0.0, + "reward": 1.2833333015441895, + "reward_std": 0.9125057458877563, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 11 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.04195804195804196, + "grad_norm": 1.4339206218719482, + "kl": 0.0007050944259390235, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "reward": 1.7333333492279053, + "reward_std": 1.0063133239746094, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.23333333432674408, + "step": 12 + }, + { + "completion_length": 130.0, + "epoch": 0.045454545454545456, + "grad_norm": 1.073473334312439, + "kl": 0.0007636564550921321, + "learning_rate": 3.25e-07, + "loss": 0.0, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 13 + }, + { + "completion_length": 356.16668701171875, + "epoch": 0.04895104895104895, + "grad_norm": 0.8452476859092712, + "kl": 0.0006562608177773654, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0, + "reward": 0.7416666746139526, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.24166667461395264, + "step": 14 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.05244755244755245, + "grad_norm": 0.9590725302696228, + "kl": 0.0008172739762812853, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.5541666746139526, + "reward_std": 0.9553031921386719, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05416666716337204, + "step": 15 + }, + { + "completion_length": 454.16668701171875, + "epoch": 0.055944055944055944, + "grad_norm": 1.2272268533706665, + "kl": 0.0007388863014057279, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 1.2083333730697632, + "reward_std": 1.0360583066940308, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 16 + }, + { + "completion_length": 152.5, + "epoch": 0.05944055944055944, + "grad_norm": 1.0074872970581055, + "kl": 0.0006766216829419136, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.0, + "reward": 0.8916666507720947, + "reward_std": 0.8662659525871277, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 17 + }, + { + "completion_length": 250.1666717529297, + "epoch": 0.06293706293706294, + "grad_norm": 1.305372953414917, + "kl": 0.001035388559103012, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.0, + "reward": 0.7166666984558105, + "reward_std": 1.2201093435287476, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 18 + }, + { + "completion_length": 243.0, + "epoch": 0.06643356643356643, + "grad_norm": 1.0690687894821167, + "kl": 0.0006665514083579183, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.0, + "reward": 0.9916666746139526, + "reward_std": 0.6167792677879333, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 19 + }, + { + "completion_length": 276.16668701171875, + "epoch": 0.06993006993006994, + "grad_norm": 1.052300214767456, + "kl": 0.0005925261066295207, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "reward": 1.5333333015441895, + "reward_std": 1.0186593532562256, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 20 + }, + { + "completion_length": 333.3333435058594, + "epoch": 0.07342657342657342, + "grad_norm": 0.95088130235672, + "kl": 0.0006341444095596671, + "learning_rate": 5.250000000000001e-07, + "loss": 0.0, + "reward": 1.8583333492279053, + "reward_std": 0.8458231687545776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3583333194255829, + "step": 21 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.07692307692307693, + "grad_norm": 1.2825149297714233, + "kl": 0.0007712479564361274, + "learning_rate": 5.5e-07, + "loss": 0.0, + "reward": 0.7666666507720947, + "reward_std": 1.1881358623504639, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10000000894069672, + "step": 22 + }, + { + "completion_length": 380.0, + "epoch": 0.08041958041958042, + "grad_norm": 1.2229748964309692, + "kl": 0.0007141837850213051, + "learning_rate": 5.750000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 0.7672461867332458, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.08391608391608392, + "grad_norm": 1.1869820356369019, + "kl": 0.0007901927456259727, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.9666666984558105, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 24 + }, + { + "completion_length": 224.33334350585938, + "epoch": 0.08741258741258741, + "grad_norm": 1.1140718460083008, + "kl": 0.0006676652701571584, + "learning_rate": 6.25e-07, + "loss": 0.0, + "reward": 1.125, + "reward_std": 1.069462537765503, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.125, + "step": 25 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.09090909090909091, + "grad_norm": 1.20625901222229, + "kl": 0.0006995900766924024, + "learning_rate": 6.5e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 26 + }, + { + "completion_length": 398.8333435058594, + "epoch": 0.0944055944055944, + "grad_norm": 5.332723617553711, + "kl": 0.0007186655420809984, + "learning_rate": 6.750000000000001e-07, + "loss": 0.0, + "reward": 1.6625001430511475, + "reward_std": 0.9664044380187988, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3291666805744171, + "step": 27 + }, + { + "completion_length": 336.3333435058594, + "epoch": 0.0979020979020979, + "grad_norm": 0.7707162499427795, + "kl": 0.0007305681938305497, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "reward": 1.441666603088379, + "reward_std": 0.9876319766044617, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.2750000059604645, + "step": 28 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.10139860139860139, + "grad_norm": 0.999113142490387, + "kl": 0.0006821553106419742, + "learning_rate": 7.25e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.15833333134651184, + "step": 29 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.1048951048951049, + "grad_norm": 1.1029480695724487, + "kl": 0.0007804523920640349, + "learning_rate": 7.5e-07, + "loss": 0.0, + "reward": 1.183333396911621, + "reward_std": 1.0680201053619385, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.18333333730697632, + "step": 30 + }, + { + "completion_length": 380.3333435058594, + "epoch": 0.10839160839160839, + "grad_norm": 0.9132871627807617, + "kl": 0.0008556495886296034, + "learning_rate": 7.750000000000001e-07, + "loss": 0.0, + "reward": 2.2375001907348633, + "reward_std": 1.4762918949127197, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40416666865348816, + "step": 31 + }, + { + "completion_length": 348.0, + "epoch": 0.11188811188811189, + "grad_norm": 1.549122929573059, + "kl": 0.0009064790210686624, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": 0.8291666507720947, + "reward_std": 1.029613733291626, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.16250000894069672, + "step": 32 + }, + { + "completion_length": 349.5, + "epoch": 0.11538461538461539, + "grad_norm": 0.8771302700042725, + "kl": 0.0008574656676501036, + "learning_rate": 8.250000000000001e-07, + "loss": 0.0, + "reward": 1.133333444595337, + "reward_std": 0.9867455363273621, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.30000001192092896, + "step": 33 + }, + { + "completion_length": 698.8333740234375, + "epoch": 0.11888111888111888, + "grad_norm": 0.7568854689598083, + "kl": 0.0007735582767054439, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0, + "reward": 1.933333396911621, + "reward_std": 1.1737406253814697, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 34 + }, + { + "completion_length": 655.3333740234375, + "epoch": 0.12237762237762238, + "grad_norm": 1.5077099800109863, + "kl": 0.0007145506679080427, + "learning_rate": 8.75e-07, + "loss": 0.0, + "reward": 1.337499976158142, + "reward_std": 0.7572566270828247, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 35 + }, + { + "completion_length": 156.0, + "epoch": 0.1258741258741259, + "grad_norm": 1.1091190576553345, + "kl": 0.0010963345412164927, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "reward": 1.1583333015441895, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.15833333134651184, + "step": 36 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.12937062937062938, + "grad_norm": 1.1978340148925781, + "kl": 0.000993944238871336, + "learning_rate": 9.25e-07, + "loss": 0.0, + "reward": 0.8333333730697632, + "reward_std": 1.2944754362106323, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 37 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.13286713286713286, + "grad_norm": 0.9296630620956421, + "kl": 0.0012741987593472004, + "learning_rate": 9.500000000000001e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 38 + }, + { + "completion_length": 284.3333435058594, + "epoch": 0.13636363636363635, + "grad_norm": 1.3948841094970703, + "kl": 0.0010804318590089679, + "learning_rate": 9.750000000000002e-07, + "loss": 0.0, + "reward": 1.1083333492279053, + "reward_std": 1.263098120689392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 39 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.13986013986013987, + "grad_norm": 1.0202951431274414, + "kl": 0.0013121496886014938, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0, + "step": 40 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.14335664335664336, + "grad_norm": 0.9724128246307373, + "kl": 0.0010785979684442282, + "learning_rate": 1.025e-06, + "loss": 0.0, + "reward": 0.6083333492279053, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.10833333432674408, + "step": 41 + }, + { + "completion_length": 603.1666870117188, + "epoch": 0.14685314685314685, + "grad_norm": 0.7776791453361511, + "kl": 0.0006764258723706007, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0, + "reward": 1.4500001668930054, + "reward_std": 0.30659419298171997, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.45000001788139343, + "step": 42 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.15034965034965034, + "grad_norm": 1.2581369876861572, + "kl": 0.0012429999187588692, + "learning_rate": 1.075e-06, + "loss": 0.0, + "reward": 1.1749999523162842, + "reward_std": 1.0567638874053955, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 43 + }, + { + "completion_length": 379.16668701171875, + "epoch": 0.15384615384615385, + "grad_norm": 2.0310208797454834, + "kl": 0.0011767616961151361, + "learning_rate": 1.1e-06, + "loss": 0.0, + "reward": 2.633333683013916, + "reward_std": 1.0595598220825195, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666663885116577, + "step": 44 + }, + { + "completion_length": 637.3333740234375, + "epoch": 0.15734265734265734, + "grad_norm": 1.2500090599060059, + "kl": 0.001643048133701086, + "learning_rate": 1.125e-06, + "loss": 0.0001, + "reward": 1.1500000953674316, + "reward_std": 0.7307531237602234, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 45 + }, + { + "completion_length": 182.0, + "epoch": 0.16083916083916083, + "grad_norm": 2.3323163986206055, + "kl": 0.003556631039828062, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.13333334028720856, + "step": 46 + }, + { + "completion_length": 109.83333587646484, + "epoch": 0.16433566433566432, + "grad_norm": 1.834832787513733, + "kl": 0.002168774139136076, + "learning_rate": 1.175e-06, + "loss": 0.0001, + "reward": 0.5583333373069763, + "reward_std": 0.6248332858085632, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 47 + }, + { + "completion_length": 337.16668701171875, + "epoch": 0.16783216783216784, + "grad_norm": 1.1725846529006958, + "kl": 0.002405840437859297, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0001, + "reward": 0.6500000357627869, + "reward_std": 0.7962412238121033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 48 + }, + { + "completion_length": 437.3333435058594, + "epoch": 0.17132867132867133, + "grad_norm": 0.743201494216919, + "kl": 0.0013375936541706324, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.0001, + "reward": 1.183333396911621, + "reward_std": 1.3611271381378174, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3499999940395355, + "step": 49 + }, + { + "completion_length": 533.8333740234375, + "epoch": 0.17482517482517482, + "grad_norm": 0.7576809525489807, + "kl": 0.0019401045283302665, + "learning_rate": 1.25e-06, + "loss": 0.0001, + "reward": 1.7291667461395264, + "reward_std": 0.7050561308860779, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5625, + "step": 50 + }, + { + "completion_length": 203.5, + "epoch": 0.17832167832167833, + "grad_norm": 1.4076164960861206, + "kl": 0.0030774520710110664, + "learning_rate": 1.275e-06, + "loss": 0.0001, + "reward": 0.7750000357627869, + "reward_std": 0.5135659575462341, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2750000059604645, + "step": 51 + }, + { + "completion_length": 409.0, + "epoch": 0.18181818181818182, + "grad_norm": 0.8726016879081726, + "kl": 0.0025800741277635098, + "learning_rate": 1.3e-06, + "loss": 0.0001, + "reward": 0.5916666984558105, + "reward_std": 0.7324047088623047, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 52 + }, + { + "completion_length": 356.5, + "epoch": 0.1853146853146853, + "grad_norm": 0.877477765083313, + "kl": 0.0021268115378916264, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.0001, + "reward": 1.6166666746139526, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.28333336114883423, + "step": 53 + }, + { + "completion_length": 243.33334350585938, + "epoch": 0.1888111888111888, + "grad_norm": 0.9792532324790955, + "kl": 0.0043938253074884415, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.0002, + "reward": 1.1708333492279053, + "reward_std": 1.282616138458252, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17083333432674408, + "step": 54 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.19230769230769232, + "grad_norm": 1.205925703048706, + "kl": 0.0031106050591915846, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.0001, + "reward": 0.9666666984558105, + "reward_std": 0.8084965944290161, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 55 + }, + { + "completion_length": 228.83334350585938, + "epoch": 0.1958041958041958, + "grad_norm": 0.7984407544136047, + "kl": 0.007072250358760357, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0003, + "reward": 0.6916667222976685, + "reward_std": 1.1655113697052002, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19166666269302368, + "step": 56 + }, + { + "completion_length": 361.66668701171875, + "epoch": 0.1993006993006993, + "grad_norm": 3.0838680267333984, + "kl": 0.006738494616001844, + "learning_rate": 1.425e-06, + "loss": 0.0003, + "reward": 1.3041667938232422, + "reward_std": 0.2600080370903015, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30416667461395264, + "step": 57 + }, + { + "completion_length": 502.66668701171875, + "epoch": 0.20279720279720279, + "grad_norm": 0.7226095795631409, + "kl": 0.0058082761242985725, + "learning_rate": 1.45e-06, + "loss": 0.0002, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.40000003576278687, + "step": 58 + }, + { + "completion_length": 210.5, + "epoch": 0.2062937062937063, + "grad_norm": 1.079681158065796, + "kl": 0.009464471600949764, + "learning_rate": 1.475e-06, + "loss": 0.0004, + "reward": 0.9750000238418579, + "reward_std": 1.1890122890472412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.14166668057441711, + "step": 59 + }, + { + "completion_length": 208.5, + "epoch": 0.2097902097902098, + "grad_norm": 1.8312753438949585, + "kl": 0.03959222882986069, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "reward": 0.5333333611488342, + "reward_std": 0.8553751707077026, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.20000001788139343, + "step": 60 + }, + { + "completion_length": 285.5, + "epoch": 0.21328671328671328, + "grad_norm": 0.9337784051895142, + "kl": 0.011914614588022232, + "learning_rate": 1.525e-06, + "loss": 0.0005, + "reward": 1.4458332061767578, + "reward_std": 0.4955846071243286, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916666865348816, + "step": 61 + }, + { + "completion_length": 276.3333435058594, + "epoch": 0.21678321678321677, + "grad_norm": 1.4266396760940552, + "kl": 0.02391706220805645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.001, + "reward": 1.1583333015441895, + "reward_std": 0.8598934412002563, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 62 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.2202797202797203, + "grad_norm": 1.1708087921142578, + "kl": 0.012987270019948483, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.0005, + "reward": 1.5416667461395264, + "reward_std": 1.3807305097579956, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 63 + }, + { + "completion_length": 237.0, + "epoch": 0.22377622377622378, + "grad_norm": 1.3068374395370483, + "kl": 0.027782242745161057, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0011, + "reward": 1.433333396911621, + "reward_std": 1.162611961364746, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2666666805744171, + "step": 64 + }, + { + "completion_length": 797.6666870117188, + "epoch": 0.22727272727272727, + "grad_norm": 0.7319328784942627, + "kl": 0.013491494581103325, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.0005, + "reward": 1.3166667222976685, + "reward_std": 0.8604747653007507, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3166666626930237, + "step": 65 + }, + { + "completion_length": 237.1666717529297, + "epoch": 0.23076923076923078, + "grad_norm": 1.9626200199127197, + "kl": 0.015099573880434036, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0006, + "reward": 0.9666666388511658, + "reward_std": 0.797287106513977, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 66 + }, + { + "completion_length": 221.1666717529297, + "epoch": 0.23426573426573427, + "grad_norm": 0.7815642952919006, + "kl": 0.03964684158563614, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0016, + "reward": 1.6416667699813843, + "reward_std": 1.0584973096847534, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.14166668057441711, + "step": 67 + }, + { + "completion_length": 227.33334350585938, + "epoch": 0.23776223776223776, + "grad_norm": 1.5282418727874756, + "kl": 0.0695306807756424, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0028, + "reward": 0.75, + "reward_std": 0.7375635504722595, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25, + "step": 68 + }, + { + "completion_length": 673.3333740234375, + "epoch": 0.24125874125874125, + "grad_norm": 0.8560697436332703, + "kl": 0.03540939837694168, + "learning_rate": 1.725e-06, + "loss": 0.0014, + "reward": 2.200000047683716, + "reward_std": 0.9581232070922852, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5333333611488342, + "step": 69 + }, + { + "completion_length": 254.6666717529297, + "epoch": 0.24475524475524477, + "grad_norm": 1.2371562719345093, + "kl": 0.03692096844315529, + "learning_rate": 1.75e-06, + "loss": 0.0015, + "reward": 1.8249998092651367, + "reward_std": 0.9968700408935547, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32499998807907104, + "step": 70 + }, + { + "completion_length": 234.6666717529297, + "epoch": 0.24825174825174826, + "grad_norm": 0.9824966192245483, + "kl": 0.07421376556158066, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.003, + "reward": 1.1666667461395264, + "reward_std": 0.6485882997512817, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 71 + }, + { + "completion_length": 580.0, + "epoch": 0.2517482517482518, + "grad_norm": 1.0504631996154785, + "kl": 0.048039551824331284, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0019, + "reward": 1.808333396911621, + "reward_std": 1.302849531173706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 72 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.25524475524475526, + "grad_norm": 0.6447965502738953, + "kl": 0.04130098968744278, + "learning_rate": 1.825e-06, + "loss": 0.0017, + "reward": 1.3875000476837158, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5541666746139526, + "step": 73 + }, + { + "completion_length": 376.16668701171875, + "epoch": 0.25874125874125875, + "grad_norm": 1.347108244895935, + "kl": 0.19923770427703857, + "learning_rate": 1.85e-06, + "loss": 0.008, + "reward": 1.529166579246521, + "reward_std": 0.6618943214416504, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 74 + }, + { + "completion_length": 227.1666717529297, + "epoch": 0.26223776223776224, + "grad_norm": 0.8091520667076111, + "kl": 0.06355344504117966, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.0025, + "reward": 0.75, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 75 + }, + { + "completion_length": 502.3333435058594, + "epoch": 0.26573426573426573, + "grad_norm": 1.1315293312072754, + "kl": 0.11514662951231003, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0046, + "reward": 1.504166603088379, + "reward_std": 1.256027102470398, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.33750003576278687, + "step": 76 + }, + { + "completion_length": 306.16668701171875, + "epoch": 0.2692307692307692, + "grad_norm": 1.6002874374389648, + "kl": 0.07964249700307846, + "learning_rate": 1.925e-06, + "loss": 0.0032, + "reward": 1.7083333730697632, + "reward_std": 1.2195971012115479, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 77 + }, + { + "completion_length": 253.0, + "epoch": 0.2727272727272727, + "grad_norm": 1.134474754333496, + "kl": 0.09407778084278107, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0038, + "reward": 1.8333333730697632, + "reward_std": 1.0842816829681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3333333432674408, + "step": 78 + }, + { + "completion_length": 456.3333435058594, + "epoch": 0.2762237762237762, + "grad_norm": 1.4590799808502197, + "kl": 0.08163408935070038, + "learning_rate": 1.975e-06, + "loss": 0.0033, + "reward": 1.1875, + "reward_std": 1.164232611656189, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3541666865348816, + "step": 79 + }, + { + "completion_length": 273.0, + "epoch": 0.27972027972027974, + "grad_norm": 1.589087724685669, + "kl": 0.08010071516036987, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0032, + "reward": 0.9125000238418579, + "reward_std": 0.9088110327720642, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.07916666567325592, + "step": 80 + }, + { + "completion_length": 196.1666717529297, + "epoch": 0.28321678321678323, + "grad_norm": 1.4217482805252075, + "kl": 0.0619954913854599, + "learning_rate": 2.025e-06, + "loss": 0.0025, + "reward": 1.058333396911621, + "reward_std": 0.7486097812652588, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 81 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.2867132867132867, + "grad_norm": 1.056475043296814, + "kl": 0.05495650693774223, + "learning_rate": 2.05e-06, + "loss": 0.0022, + "reward": 0.8625000715255737, + "reward_std": 0.5305068492889404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.19583332538604736, + "step": 82 + }, + { + "completion_length": 410.66668701171875, + "epoch": 0.2902097902097902, + "grad_norm": 0.5162915587425232, + "kl": 0.04134432598948479, + "learning_rate": 2.075e-06, + "loss": 0.0017, + "reward": 1.1875, + "reward_std": 0.7466174364089966, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1875, + "step": 83 + }, + { + "completion_length": 510.66668701171875, + "epoch": 0.2937062937062937, + "grad_norm": 0.9501734972000122, + "kl": 0.047528013586997986, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0019, + "reward": 1.258333444595337, + "reward_std": 1.1069854497909546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 84 + }, + { + "completion_length": 476.0, + "epoch": 0.2972027972027972, + "grad_norm": 1.0745543241500854, + "kl": 0.04738708958029747, + "learning_rate": 2.125e-06, + "loss": 0.0019, + "reward": 0.7541666030883789, + "reward_std": 0.6050654649734497, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2541666626930237, + "step": 85 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.3006993006993007, + "grad_norm": 0.7894018888473511, + "kl": 0.03818603605031967, + "learning_rate": 2.15e-06, + "loss": 0.0015, + "reward": 1.5499999523162842, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 86 + }, + { + "completion_length": 157.5, + "epoch": 0.3041958041958042, + "grad_norm": 1.2285088300704956, + "kl": 0.04852033406496048, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 1.2284135818481445, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 87 + }, + { + "completion_length": 853.5, + "epoch": 0.3076923076923077, + "grad_norm": 1.1314716339111328, + "kl": 0.03052813559770584, + "learning_rate": 2.2e-06, + "loss": 0.0012, + "reward": 1.5625, + "reward_std": 1.093817949295044, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3958333432674408, + "step": 88 + }, + { + "completion_length": 372.66668701171875, + "epoch": 0.3111888111888112, + "grad_norm": 0.9353286623954773, + "kl": 0.027921725064516068, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.0011, + "reward": 1.8250000476837158, + "reward_std": 0.9234446287155151, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.32500001788139343, + "step": 89 + }, + { + "completion_length": 296.3333435058594, + "epoch": 0.3146853146853147, + "grad_norm": 1.140289306640625, + "kl": 0.04811665043234825, + "learning_rate": 2.25e-06, + "loss": 0.0019, + "reward": 1.125, + "reward_std": 1.1268318891525269, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 90 + }, + { + "completion_length": 99.83333587646484, + "epoch": 0.3181818181818182, + "grad_norm": 4.178561687469482, + "kl": 0.09318779408931732, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.0037, + "reward": 0.5583333373069763, + "reward_std": 0.9645810127258301, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.05833333358168602, + "step": 91 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.32167832167832167, + "grad_norm": 1.560648798942566, + "kl": 0.03698144853115082, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0015, + "reward": 1.9249999523162842, + "reward_std": 0.718853235244751, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.25833335518836975, + "step": 92 + }, + { + "completion_length": 576.5, + "epoch": 0.32517482517482516, + "grad_norm": 1.093043327331543, + "kl": 0.021529672667384148, + "learning_rate": 2.325e-06, + "loss": 0.0009, + "reward": 1.070833444595337, + "reward_std": 0.6477686166763306, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.23749999701976776, + "step": 93 + }, + { + "completion_length": 335.8333435058594, + "epoch": 0.32867132867132864, + "grad_norm": 0.8303731679916382, + "kl": 0.019405633211135864, + "learning_rate": 2.35e-06, + "loss": 0.0008, + "reward": 0.8416666984558105, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.17499999701976776, + "step": 94 + }, + { + "completion_length": 569.5, + "epoch": 0.3321678321678322, + "grad_norm": 1.4912625551223755, + "kl": 0.014733041636645794, + "learning_rate": 2.375e-06, + "loss": 0.0006, + "reward": 1.4541667699813843, + "reward_std": 1.1459076404571533, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4541666507720947, + "step": 95 + }, + { + "completion_length": 232.83334350585938, + "epoch": 0.3356643356643357, + "grad_norm": 0.9174475073814392, + "kl": 0.018923718482255936, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0008, + "reward": 1.3333333730697632, + "reward_std": 0.9877583980560303, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.1666666716337204, + "step": 96 + }, + { + "completion_length": 742.1666870117188, + "epoch": 0.33916083916083917, + "grad_norm": 1.258750557899475, + "kl": 0.017664968967437744, + "learning_rate": 2.425e-06, + "loss": 0.0007, + "reward": 1.4583333730697632, + "reward_std": 0.6202150583267212, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 97 + }, + { + "completion_length": 270.8333435058594, + "epoch": 0.34265734265734266, + "grad_norm": 0.9259786605834961, + "kl": 0.05115365609526634, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.002, + "reward": 1.5500000715255737, + "reward_std": 0.8729261159896851, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.21666666865348816, + "step": 98 + }, + { + "completion_length": 476.3333435058594, + "epoch": 0.34615384615384615, + "grad_norm": 1.240902066230774, + "kl": 0.036602895706892014, + "learning_rate": 2.475e-06, + "loss": 0.0015, + "reward": 1.2791666984558105, + "reward_std": 1.1935679912567139, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.27916669845581055, + "step": 99 + }, + { + "completion_length": 213.6666717529297, + "epoch": 0.34965034965034963, + "grad_norm": 0.943215548992157, + "kl": 0.04590342566370964, + "learning_rate": 2.5e-06, + "loss": 0.0018, + "reward": 1.841666579246521, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.34166666865348816, + "step": 100 + }, + { + "completion_length": 401.0, + "epoch": 0.3531468531468531, + "grad_norm": 0.7366496324539185, + "kl": 0.016905900090932846, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.0007, + "reward": 1.3000000715255737, + "reward_std": 1.1256110668182373, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.30000001192092896, + "step": 101 + }, + { + "completion_length": 854.5, + "epoch": 0.35664335664335667, + "grad_norm": 8.089740753173828, + "kl": 0.08785610646009445, + "learning_rate": 2.55e-06, + "loss": 0.0035, + "reward": 1.316666603088379, + "reward_std": 1.2330517768859863, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 102 + }, + { + "completion_length": 455.16668701171875, + "epoch": 0.36013986013986016, + "grad_norm": 1.6066083908081055, + "kl": 0.03349429741501808, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.0013, + "reward": 1.7333333492279053, + "reward_std": 1.6448911428451538, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 103 + }, + { + "completion_length": 558.6666870117188, + "epoch": 0.36363636363636365, + "grad_norm": 1.2461860179901123, + "kl": 0.0453556627035141, + "learning_rate": 2.6e-06, + "loss": 0.0018, + "reward": 1.933333396911621, + "reward_std": 1.1851863861083984, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 104 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.36713286713286714, + "grad_norm": 0.9176071286201477, + "kl": 0.05445032939314842, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0022, + "reward": 1.2916667461395264, + "reward_std": 0.9144214391708374, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2916666865348816, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.3706293706293706, + "grad_norm": 1.1796709299087524, + "kl": 0.08697855472564697, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0035, + "reward": 0.9833333492279053, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 106 + }, + { + "completion_length": 556.8333740234375, + "epoch": 0.3741258741258741, + "grad_norm": 1.1719709634780884, + "kl": 0.09557916224002838, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.0038, + "reward": 0.9541666507720947, + "reward_std": 1.0742924213409424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 107 + }, + { + "completion_length": 490.8333435058594, + "epoch": 0.3776223776223776, + "grad_norm": 0.9839584827423096, + "kl": 0.07620736211538315, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.003, + "reward": 1.3416666984558105, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5083333253860474, + "step": 108 + }, + { + "completion_length": 459.8333435058594, + "epoch": 0.3811188811188811, + "grad_norm": 1.0232492685317993, + "kl": 0.09754881262779236, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.0039, + "reward": 1.7916667461395264, + "reward_std": 1.201422929763794, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 109 + }, + { + "completion_length": 432.5, + "epoch": 0.38461538461538464, + "grad_norm": 0.7946304082870483, + "kl": 0.043154411017894745, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0017, + "reward": 2.1000001430511475, + "reward_std": 0.8933085203170776, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.4333333373069763, + "step": 110 + }, + { + "completion_length": 346.8333435058594, + "epoch": 0.3881118881118881, + "grad_norm": 0.9842674136161804, + "kl": 0.1046643778681755, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.0042, + "reward": 0.8166667222976685, + "reward_std": 0.7353004217147827, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.3166666626930237, + "step": 111 + }, + { + "completion_length": 214.5, + "epoch": 0.3916083916083916, + "grad_norm": 1.1671849489212036, + "kl": 0.1281026154756546, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0051, + "reward": 1.0500000715255737, + "reward_std": 0.14832398295402527, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.21666666865348816, + "step": 112 + }, + { + "completion_length": 908.6666870117188, + "epoch": 0.3951048951048951, + "grad_norm": 0.3388780951499939, + "kl": 0.022495290264487267, + "learning_rate": 2.825e-06, + "loss": 0.0009, + "reward": 2.3375000953674316, + "reward_std": 0.3727431893348694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6708333492279053, + "step": 113 + }, + { + "completion_length": 891.6666870117188, + "epoch": 0.3986013986013986, + "grad_norm": 0.467278391122818, + "kl": 0.025123490020632744, + "learning_rate": 2.85e-06, + "loss": 0.001, + "reward": 1.8541667461395264, + "reward_std": 0.7543899416923523, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6875, + "step": 114 + }, + { + "completion_length": 546.1666870117188, + "epoch": 0.4020979020979021, + "grad_norm": 1.054366111755371, + "kl": 0.0783834159374237, + "learning_rate": 2.875e-06, + "loss": 0.0031, + "reward": 2.4000000953674316, + "reward_std": 1.306904673576355, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 115 + }, + { + "completion_length": 835.1666870117188, + "epoch": 0.40559440559440557, + "grad_norm": 0.7376688122749329, + "kl": 0.04768560454249382, + "learning_rate": 2.9e-06, + "loss": 0.0019, + "reward": 1.5291666984558105, + "reward_std": 0.32841163873672485, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5291666984558105, + "step": 116 + }, + { + "completion_length": 368.3333435058594, + "epoch": 0.4090909090909091, + "grad_norm": 1.456405758857727, + "kl": 0.1393664926290512, + "learning_rate": 2.925e-06, + "loss": 0.0056, + "reward": 0.9541666507720947, + "reward_std": 0.7450531721115112, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.2875000238418579, + "step": 117 + }, + { + "completion_length": 485.5, + "epoch": 0.4125874125874126, + "grad_norm": 1.4957919120788574, + "kl": 0.1291833370923996, + "learning_rate": 2.95e-06, + "loss": 0.0052, + "reward": 1.5833333730697632, + "reward_std": 1.4998888969421387, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4166666865348816, + "step": 118 + }, + { + "completion_length": 356.3333435058594, + "epoch": 0.4160839160839161, + "grad_norm": 1.178475022315979, + "kl": 0.10108506679534912, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.004, + "reward": 0.7083333730697632, + "reward_std": 0.7506109476089478, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.375, + "step": 119 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.4195804195804196, + "grad_norm": 1.4624924659729004, + "kl": 0.2249661386013031, + "learning_rate": 3e-06, + "loss": 0.009, + "reward": 0.9166666865348816, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.0833333358168602, + "step": 120 + }, + { + "completion_length": 673.1666870117188, + "epoch": 0.4230769230769231, + "grad_norm": 1.0837116241455078, + "kl": 0.09312133491039276, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.0037, + "reward": 2.2208335399627686, + "reward_std": 0.9818881750106812, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.38749998807907104, + "step": 121 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.42657342657342656, + "grad_norm": 1.0982871055603027, + "kl": 0.05689762160181999, + "learning_rate": 3.05e-06, + "loss": 0.0023, + "reward": 1.1166666746139526, + "reward_std": 0.7567474246025085, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.11666666716337204, + "step": 122 + }, + { + "completion_length": 576.1666870117188, + "epoch": 0.43006993006993005, + "grad_norm": 1.0922025442123413, + "kl": 0.04579655081033707, + "learning_rate": 3.075e-06, + "loss": 0.0018, + "reward": 2.4000000953674316, + "reward_std": 1.0807406902313232, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5666666626930237, + "step": 123 + }, + { + "completion_length": 736.6666870117188, + "epoch": 0.43356643356643354, + "grad_norm": 1.5019290447235107, + "kl": 0.030428007245063782, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0012, + "reward": 1.504166603088379, + "reward_std": 1.2472386360168457, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5041667222976685, + "step": 124 + }, + { + "completion_length": 603.5, + "epoch": 0.4370629370629371, + "grad_norm": 4.212569713592529, + "kl": 0.37697991728782654, + "learning_rate": 3.125e-06, + "loss": 0.0151, + "reward": 1.6416667699813843, + "reward_std": 0.8303112387657166, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416667103767395, + "step": 125 + }, + { + "completion_length": 492.0, + "epoch": 0.4405594405594406, + "grad_norm": 0.9634215831756592, + "kl": 0.06763506680727005, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0027, + "reward": 2.125, + "reward_std": 1.2069590091705322, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 126 + }, + { + "completion_length": 792.1666870117188, + "epoch": 0.44405594405594406, + "grad_norm": 0.4220138192176819, + "kl": 0.03986603766679764, + "learning_rate": 3.175e-06, + "loss": 0.0016, + "reward": 1.1375000476837158, + "reward_std": 0.5137485265731812, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6375000476837158, + "step": 127 + }, + { + "completion_length": 535.5, + "epoch": 0.44755244755244755, + "grad_norm": 4.797938823699951, + "kl": 0.13327616453170776, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0053, + "reward": 1.1791666746139526, + "reward_std": 1.1582764387130737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.34583336114883423, + "step": 128 + }, + { + "completion_length": 444.8333435058594, + "epoch": 0.45104895104895104, + "grad_norm": 0.7808079719543457, + "kl": 0.055326174944639206, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.0022, + "reward": 1.495833396911621, + "reward_std": 0.7681823968887329, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.16250000894069672, + "step": 129 + }, + { + "completion_length": 454.66668701171875, + "epoch": 0.45454545454545453, + "grad_norm": 0.8776301741600037, + "kl": 0.11162035167217255, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0045, + "reward": 1.5750001668930054, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.24166665971279144, + "step": 130 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.458041958041958, + "grad_norm": 0.4391367733478546, + "kl": 0.025292951613664627, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.001, + "reward": 2.433333396911621, + "reward_std": 0.2746209502220154, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6000000238418579, + "step": 131 + }, + { + "completion_length": 528.6666870117188, + "epoch": 0.46153846153846156, + "grad_norm": 0.8809014558792114, + "kl": 0.12223925441503525, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0049, + "reward": 2.120833396911621, + "reward_std": 1.101410150527954, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541666507720947, + "step": 132 + }, + { + "completion_length": 491.3333435058594, + "epoch": 0.46503496503496505, + "grad_norm": 1.0070464611053467, + "kl": 0.05908138304948807, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.0024, + "reward": 0.5916666984558105, + "reward_std": 0.5335416197776794, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.42500001192092896, + "step": 133 + }, + { + "completion_length": 892.5, + "epoch": 0.46853146853146854, + "grad_norm": 0.4570764899253845, + "kl": 0.037701599299907684, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0015, + "reward": 1.7249999046325684, + "reward_std": 1.292478322982788, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 134 + }, + { + "completion_length": 806.8333740234375, + "epoch": 0.47202797202797203, + "grad_norm": 0.5572299361228943, + "kl": 0.05404336377978325, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.0022, + "reward": 1.4583333730697632, + "reward_std": 0.990033745765686, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666269302368, + "step": 135 + }, + { + "completion_length": 589.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.7575751543045044, + "kl": 0.04170485585927963, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0017, + "reward": 2.683333396911621, + "reward_std": 1.1075499057769775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8500000238418579, + "step": 136 + }, + { + "completion_length": 1060.166748046875, + "epoch": 0.479020979020979, + "grad_norm": 0.5119641423225403, + "kl": 0.04976843297481537, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.002, + "reward": 1.1125000715255737, + "reward_std": 0.39457258582115173, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 137 + }, + { + "completion_length": 559.8333740234375, + "epoch": 0.4825174825174825, + "grad_norm": 0.6115387082099915, + "kl": 0.05675242468714714, + "learning_rate": 3.45e-06, + "loss": 0.0023, + "reward": 2.0416667461395264, + "reward_std": 0.5715476274490356, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5416666865348816, + "step": 138 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.486013986013986, + "grad_norm": 1.2578071355819702, + "kl": 0.07080799341201782, + "learning_rate": 3.475e-06, + "loss": 0.0028, + "reward": 1.379166603088379, + "reward_std": 1.0072758197784424, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 139 + }, + { + "completion_length": 987.5, + "epoch": 0.48951048951048953, + "grad_norm": 0.6280319690704346, + "kl": 0.03268418833613396, + "learning_rate": 3.5e-06, + "loss": 0.0013, + "reward": 0.9291666746139526, + "reward_std": 0.6654728651046753, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.5958333015441895, + "step": 140 + }, + { + "completion_length": 728.5, + "epoch": 0.493006993006993, + "grad_norm": 0.8773026466369629, + "kl": 0.032183535397052765, + "learning_rate": 3.525e-06, + "loss": 0.0013, + "reward": 2.862499952316284, + "reward_std": 0.7864078879356384, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6958333253860474, + "step": 141 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.4965034965034965, + "grad_norm": 0.8974792957305908, + "kl": 0.059865664690732956, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0024, + "reward": 1.6875, + "reward_std": 0.8300225734710693, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.3541666865348816, + "step": 142 + }, + { + "completion_length": 1081.666748046875, + "epoch": 0.5, + "grad_norm": 0.5286564230918884, + "kl": 0.022505857050418854, + "learning_rate": 3.575e-06, + "loss": 0.0009, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8708332777023315, + "step": 143 + }, + { + "completion_length": 1141.3333740234375, + "epoch": 0.5034965034965035, + "grad_norm": 0.527409017086029, + "kl": 0.021072231233119965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0008, + "reward": 1.9291666746139526, + "reward_std": 0.7955214381217957, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5958333611488342, + "step": 144 + }, + { + "completion_length": 515.5, + "epoch": 0.506993006993007, + "grad_norm": 2.5036261081695557, + "kl": 0.3181736469268799, + "learning_rate": 3.625e-06, + "loss": 0.0127, + "reward": 1.5833333730697632, + "reward_std": 0.9988327026367188, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5833333730697632, + "step": 145 + }, + { + "completion_length": 599.5, + "epoch": 0.5104895104895105, + "grad_norm": 0.7538139224052429, + "kl": 0.041587017476558685, + "learning_rate": 3.65e-06, + "loss": 0.0017, + "reward": 1.3583334684371948, + "reward_std": 0.6873258352279663, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6916666030883789, + "step": 146 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.513986013986014, + "grad_norm": 0.6815938353538513, + "kl": 0.031590305268764496, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.0013, + "reward": 2.445833683013916, + "reward_std": 1.186003565788269, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6124999523162842, + "step": 147 + }, + { + "completion_length": 731.0, + "epoch": 0.5174825174825175, + "grad_norm": 1.4654277563095093, + "kl": 0.11272114515304565, + "learning_rate": 3.7e-06, + "loss": 0.0045, + "reward": 1.2125000953674316, + "reward_std": 0.7435977458953857, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7124999761581421, + "step": 148 + }, + { + "completion_length": 476.16668701171875, + "epoch": 0.5209790209790209, + "grad_norm": 3.388495683670044, + "kl": 0.9080104827880859, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.0363, + "reward": 1.8958333730697632, + "reward_std": 0.9965461492538452, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3958333432674408, + "step": 149 + }, + { + "completion_length": 1053.166748046875, + "epoch": 0.5244755244755245, + "grad_norm": 0.4761454164981842, + "kl": 0.027715642005205154, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0011, + "reward": 3.2916667461395264, + "reward_std": 0.7417322397232056, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 150 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.527972027972028, + "grad_norm": 0.6827074885368347, + "kl": 0.0386313796043396, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.0015, + "reward": 2.495833396911621, + "reward_std": 1.0227923393249512, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6625000238418579, + "step": 151 + }, + { + "completion_length": 721.8333740234375, + "epoch": 0.5314685314685315, + "grad_norm": 1.2814685106277466, + "kl": 0.041070081293582916, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0016, + "reward": 2.4666666984558105, + "reward_std": 0.8834120631217957, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 152 + }, + { + "completion_length": 513.0, + "epoch": 0.534965034965035, + "grad_norm": 0.6044140458106995, + "kl": 0.08036690950393677, + "learning_rate": 3.825000000000001e-06, + "loss": 0.0032, + "reward": 1.7875001430511475, + "reward_std": 1.1646621227264404, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6208333373069763, + "step": 153 + }, + { + "completion_length": 720.8333740234375, + "epoch": 0.5384615384615384, + "grad_norm": 0.7732751965522766, + "kl": 0.04927179962396622, + "learning_rate": 3.85e-06, + "loss": 0.002, + "reward": 2.383333206176758, + "reward_std": 1.4126808643341064, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 154 + }, + { + "completion_length": 708.8333740234375, + "epoch": 0.541958041958042, + "grad_norm": 0.6660548448562622, + "kl": 0.07937665283679962, + "learning_rate": 3.875e-06, + "loss": 0.0032, + "reward": 2.183333396911621, + "reward_std": 0.6377042531967163, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8500000238418579, + "step": 155 + }, + { + "completion_length": 1192.0, + "epoch": 0.5454545454545454, + "grad_norm": 0.3896901309490204, + "kl": 0.025209862738847733, + "learning_rate": 3.900000000000001e-06, + "loss": 0.001, + "reward": 1.8833332061767578, + "reward_std": 0.8691471815109253, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 156 + }, + { + "completion_length": 705.1666870117188, + "epoch": 0.548951048951049, + "grad_norm": 0.5750932097434998, + "kl": 0.04517858847975731, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.0018, + "reward": 2.9541664123535156, + "reward_std": 0.6458360552787781, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6208333373069763, + "step": 157 + }, + { + "completion_length": 465.5, + "epoch": 0.5524475524475524, + "grad_norm": 0.8335661888122559, + "kl": 0.08351196348667145, + "learning_rate": 3.95e-06, + "loss": 0.0033, + "reward": 2.424999952316284, + "reward_std": 0.941673994064331, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666984558105, + "step": 158 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.5559440559440559, + "grad_norm": 1.1459757089614868, + "kl": 0.12647944688796997, + "learning_rate": 3.975000000000001e-06, + "loss": 0.0051, + "reward": 1.6416667699813843, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 159 + }, + { + "completion_length": 798.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.4939272105693817, + "kl": 0.051064085215330124, + "learning_rate": 4.000000000000001e-06, + "loss": 0.002, + "reward": 2.183333396911621, + "reward_std": 1.2081665992736816, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 160 + }, + { + "completion_length": 338.8333435058594, + "epoch": 0.5629370629370629, + "grad_norm": 0.8890612125396729, + "kl": 0.12327366322278976, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.0049, + "reward": 2.575000286102295, + "reward_std": 0.9913375377655029, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40833336114883423, + "step": 161 + }, + { + "completion_length": 809.6666870117188, + "epoch": 0.5664335664335665, + "grad_norm": 0.3928314447402954, + "kl": 0.040153808891773224, + "learning_rate": 4.05e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.5225937366485596, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7208333015441895, + "step": 162 + }, + { + "completion_length": 766.0, + "epoch": 0.5699300699300699, + "grad_norm": 0.7869060039520264, + "kl": 0.04531605541706085, + "learning_rate": 4.075e-06, + "loss": 0.0018, + "reward": 2.120833396911621, + "reward_std": 0.8866251707077026, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4541667103767395, + "step": 163 + }, + { + "completion_length": 1085.666748046875, + "epoch": 0.5734265734265734, + "grad_norm": 1.0671396255493164, + "kl": 0.06464602053165436, + "learning_rate": 4.1e-06, + "loss": 0.0026, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 164 + }, + { + "completion_length": 628.1666870117188, + "epoch": 0.5769230769230769, + "grad_norm": 0.9583672285079956, + "kl": 0.06743767857551575, + "learning_rate": 4.125e-06, + "loss": 0.0027, + "reward": 2.137500286102295, + "reward_std": 1.376930594444275, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.637499988079071, + "step": 165 + }, + { + "completion_length": 351.8333435058594, + "epoch": 0.5804195804195804, + "grad_norm": 0.6946209669113159, + "kl": 0.09894745796918869, + "learning_rate": 4.15e-06, + "loss": 0.004, + "reward": 2.7750000953674316, + "reward_std": 0.7055140733718872, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4416666626930237, + "step": 166 + }, + { + "completion_length": 448.16668701171875, + "epoch": 0.583916083916084, + "grad_norm": 0.6712130308151245, + "kl": 0.0714031383395195, + "learning_rate": 4.175e-06, + "loss": 0.0029, + "reward": 1.9583333730697632, + "reward_std": 0.6499359011650085, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6250000596046448, + "step": 167 + }, + { + "completion_length": 763.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.5934569239616394, + "kl": 0.039833370596170425, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0016, + "reward": 1.7208335399627686, + "reward_std": 0.6870983839035034, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.720833420753479, + "step": 168 + }, + { + "completion_length": 813.8333740234375, + "epoch": 0.5909090909090909, + "grad_norm": 0.46408811211586, + "kl": 0.0639135017991066, + "learning_rate": 4.225e-06, + "loss": 0.0026, + "reward": 2.6625001430511475, + "reward_std": 0.271454393863678, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.6625000238418579, + "step": 169 + }, + { + "completion_length": 621.3333740234375, + "epoch": 0.5944055944055944, + "grad_norm": 1.6175382137298584, + "kl": 0.23431169986724854, + "learning_rate": 4.25e-06, + "loss": 0.0094, + "reward": 1.5250000953674316, + "reward_std": 1.00784432888031, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 170 + }, + { + "completion_length": 685.1666870117188, + "epoch": 0.5979020979020979, + "grad_norm": 0.7504808306694031, + "kl": 0.06654171645641327, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.0027, + "reward": 2.4583334922790527, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 171 + }, + { + "completion_length": 772.6666870117188, + "epoch": 0.6013986013986014, + "grad_norm": 0.39892545342445374, + "kl": 0.030765770003199577, + "learning_rate": 4.3e-06, + "loss": 0.0012, + "reward": 1.7333333492279053, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 172 + }, + { + "completion_length": 600.8333740234375, + "epoch": 0.6048951048951049, + "grad_norm": 0.6147928833961487, + "kl": 0.07108036428689957, + "learning_rate": 4.325e-06, + "loss": 0.0028, + "reward": 2.054166793823242, + "reward_std": 0.5684225559234619, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7208333015441895, + "step": 173 + }, + { + "completion_length": 761.3333740234375, + "epoch": 0.6083916083916084, + "grad_norm": 1.1690645217895508, + "kl": 0.11572085320949554, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0046, + "reward": 1.9583333730697632, + "reward_std": 1.2491663694381714, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7916666865348816, + "step": 174 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.6118881118881119, + "grad_norm": 1.141146183013916, + "kl": 0.0763167217373848, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.0031, + "reward": 1.4458335638046265, + "reward_std": 1.0782413482666016, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 175 + }, + { + "completion_length": 582.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.9667629599571228, + "kl": 0.04065123200416565, + "learning_rate": 4.4e-06, + "loss": 0.0016, + "reward": 1.5625, + "reward_std": 1.3656271696090698, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5625, + "step": 176 + }, + { + "completion_length": 653.6666870117188, + "epoch": 0.6188811188811189, + "grad_norm": 0.7743256092071533, + "kl": 0.07254478335380554, + "learning_rate": 4.425e-06, + "loss": 0.0029, + "reward": 1.308333396911621, + "reward_std": 0.7324048280715942, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6416666507720947, + "step": 177 + }, + { + "completion_length": 624.8333740234375, + "epoch": 0.6223776223776224, + "grad_norm": 1.7900493144989014, + "kl": 0.2500300407409668, + "learning_rate": 4.450000000000001e-06, + "loss": 0.01, + "reward": 1.3583333492279053, + "reward_std": 0.7825705409049988, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916667222976685, + "step": 178 + }, + { + "completion_length": 1285.0, + "epoch": 0.6258741258741258, + "grad_norm": 0.3387628197669983, + "kl": 0.025821728631854057, + "learning_rate": 4.475e-06, + "loss": 0.001, + "reward": 2.7916667461395264, + "reward_std": 0.678355872631073, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666269302368, + "step": 179 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.6293706293706294, + "grad_norm": 0.41932833194732666, + "kl": 0.04700490087270737, + "learning_rate": 4.5e-06, + "loss": 0.0019, + "reward": 1.8500001430511475, + "reward_std": 0.6782330274581909, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 180 + }, + { + "completion_length": 771.8333740234375, + "epoch": 0.6328671328671329, + "grad_norm": 0.6049262881278992, + "kl": 0.05856431648135185, + "learning_rate": 4.525000000000001e-06, + "loss": 0.0023, + "reward": 1.6624999046325684, + "reward_std": 1.5213277339935303, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6625000238418579, + "step": 181 + }, + { + "completion_length": 718.3333740234375, + "epoch": 0.6363636363636364, + "grad_norm": 0.519266664981842, + "kl": 0.05408002436161041, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0022, + "reward": 3.012500286102295, + "reward_std": 1.0839452743530273, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 182 + }, + { + "completion_length": 417.3333435058594, + "epoch": 0.6398601398601399, + "grad_norm": 1.159592866897583, + "kl": 0.06883987784385681, + "learning_rate": 4.575e-06, + "loss": 0.0028, + "reward": 2.308333396911621, + "reward_std": 1.089686393737793, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416666507720947, + "step": 183 + }, + { + "completion_length": 403.66668701171875, + "epoch": 0.6433566433566433, + "grad_norm": 0.9109689593315125, + "kl": 0.12938742339611053, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0052, + "reward": 2.829166889190674, + "reward_std": 0.9263390898704529, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4958333373069763, + "step": 184 + }, + { + "completion_length": 584.1666870117188, + "epoch": 0.6468531468531469, + "grad_norm": 1.3091282844543457, + "kl": 0.1182996854186058, + "learning_rate": 4.625000000000001e-06, + "loss": 0.0047, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 185 + }, + { + "completion_length": 715.8333740234375, + "epoch": 0.6503496503496503, + "grad_norm": 0.8944427967071533, + "kl": 0.07471362501382828, + "learning_rate": 4.65e-06, + "loss": 0.003, + "reward": 2.5500001907348633, + "reward_std": 1.0044898986816406, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 186 + }, + { + "completion_length": 328.66668701171875, + "epoch": 0.6538461538461539, + "grad_norm": 2.0265045166015625, + "kl": 0.3070363402366638, + "learning_rate": 4.675000000000001e-06, + "loss": 0.0123, + "reward": 2.0291666984558105, + "reward_std": 0.9910117983818054, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.36250001192092896, + "step": 187 + }, + { + "completion_length": 463.8333435058594, + "epoch": 0.6573426573426573, + "grad_norm": 1.1863874197006226, + "kl": 0.07772837579250336, + "learning_rate": 4.7e-06, + "loss": 0.0031, + "reward": 2.5333335399627686, + "reward_std": 0.9558593034744263, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5333333611488342, + "step": 188 + }, + { + "completion_length": 516.5, + "epoch": 0.6608391608391608, + "grad_norm": 0.690477192401886, + "kl": 0.08707510679960251, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0035, + "reward": 3.4000000953674316, + "reward_std": 1.2024973630905151, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 189 + }, + { + "completion_length": 656.8333740234375, + "epoch": 0.6643356643356644, + "grad_norm": 0.7191756963729858, + "kl": 0.05152536556124687, + "learning_rate": 4.75e-06, + "loss": 0.0021, + "reward": 1.7833335399627686, + "reward_std": 0.5288351774215698, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 190 + }, + { + "completion_length": 510.16668701171875, + "epoch": 0.6678321678321678, + "grad_norm": 1.589722990989685, + "kl": 0.11165278404951096, + "learning_rate": 4.775e-06, + "loss": 0.0045, + "reward": 1.5916666984558105, + "reward_std": 1.1620744466781616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5916666984558105, + "step": 191 + }, + { + "completion_length": 463.3333435058594, + "epoch": 0.6713286713286714, + "grad_norm": 1.1402506828308105, + "kl": 0.12224837392568588, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0049, + "reward": 3.0166664123535156, + "reward_std": 0.46224093437194824, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6833333373069763, + "step": 192 + }, + { + "completion_length": 668.8333740234375, + "epoch": 0.6748251748251748, + "grad_norm": 0.829407811164856, + "kl": 0.04827030003070831, + "learning_rate": 4.825e-06, + "loss": 0.0019, + "reward": 2.516666889190674, + "reward_std": 0.9416297674179077, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 193 + }, + { + "completion_length": 653.1666870117188, + "epoch": 0.6783216783216783, + "grad_norm": 0.8737359642982483, + "kl": 0.11687206476926804, + "learning_rate": 4.85e-06, + "loss": 0.0047, + "reward": 1.883333444595337, + "reward_std": 0.9978310465812683, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666388511658, + "step": 194 + }, + { + "completion_length": 521.1666870117188, + "epoch": 0.6818181818181818, + "grad_norm": 1.265020728111267, + "kl": 0.1497541069984436, + "learning_rate": 4.875e-06, + "loss": 0.006, + "reward": 1.6666667461395264, + "reward_std": 1.1578716039657593, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6666666865348816, + "step": 195 + }, + { + "completion_length": 720.3333740234375, + "epoch": 0.6853146853146853, + "grad_norm": 0.5844486355781555, + "kl": 0.07905390858650208, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0032, + "reward": 2.683333396911621, + "reward_std": 0.7659417986869812, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 196 + }, + { + "completion_length": 654.3333740234375, + "epoch": 0.6888111888111889, + "grad_norm": 1.0279442071914673, + "kl": 0.05869147181510925, + "learning_rate": 4.925e-06, + "loss": 0.0023, + "reward": 1.8250000476837158, + "reward_std": 1.047735571861267, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.824999988079071, + "step": 197 + }, + { + "completion_length": 696.5, + "epoch": 0.6923076923076923, + "grad_norm": 0.5949178338050842, + "kl": 0.10564576834440231, + "learning_rate": 4.95e-06, + "loss": 0.0042, + "reward": 2.7958333492279053, + "reward_std": 0.8044278621673584, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 198 + }, + { + "completion_length": 667.3333740234375, + "epoch": 0.6958041958041958, + "grad_norm": 1.4045933485031128, + "kl": 0.2249039262533188, + "learning_rate": 4.975000000000001e-06, + "loss": 0.009, + "reward": 1.7833333015441895, + "reward_std": 1.2967909574508667, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 199 + }, + { + "completion_length": 549.0, + "epoch": 0.6993006993006993, + "grad_norm": 11.491266250610352, + "kl": 2.7085909843444824, + "learning_rate": 5e-06, + "loss": 0.1083, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 200 + }, + { + "completion_length": 1157.666748046875, + "epoch": 0.7027972027972028, + "grad_norm": 0.3758504092693329, + "kl": 0.03439244627952576, + "learning_rate": 4.99999619228322e-06, + "loss": 0.0014, + "reward": 1.5375001430511475, + "reward_std": 0.490853875875473, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 201 + }, + { + "completion_length": 276.66668701171875, + "epoch": 0.7062937062937062, + "grad_norm": 1.4240407943725586, + "kl": 0.09711845219135284, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0039, + "reward": 1.774999976158142, + "reward_std": 1.4250439405441284, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.44166669249534607, + "step": 202 + }, + { + "completion_length": 506.16668701171875, + "epoch": 0.7097902097902098, + "grad_norm": 0.8863720893859863, + "kl": 0.0886097177863121, + "learning_rate": 4.999965730618567e-06, + "loss": 0.0035, + "reward": 2.4166667461395264, + "reward_std": 0.7717944979667664, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 203 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.7132867132867133, + "grad_norm": 1.036176323890686, + "kl": 0.11752279102802277, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0047, + "reward": 1.8583334684371948, + "reward_std": 0.7761551141738892, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 204 + }, + { + "completion_length": 590.3333740234375, + "epoch": 0.7167832167832168, + "grad_norm": 1.2968803644180298, + "kl": 0.1260688155889511, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.005, + "reward": 1.883333444595337, + "reward_std": 1.0934655666351318, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 205 + }, + { + "completion_length": 653.3333740234375, + "epoch": 0.7202797202797203, + "grad_norm": 1.9041389226913452, + "kl": 0.350026935338974, + "learning_rate": 4.999862923413781e-06, + "loss": 0.014, + "reward": 1.8041666746139526, + "reward_std": 0.5104941129684448, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6375000476837158, + "step": 206 + }, + { + "completion_length": 359.3333435058594, + "epoch": 0.7237762237762237, + "grad_norm": 1.4652067422866821, + "kl": 0.09337612986564636, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.0037, + "reward": 1.875, + "reward_std": 1.1440061330795288, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5416666865348816, + "step": 207 + }, + { + "completion_length": 393.3333435058594, + "epoch": 0.7272727272727273, + "grad_norm": 0.8172839879989624, + "kl": 0.11479752510786057, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0046, + "reward": 3.2916667461395264, + "reward_std": 0.46627962589263916, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.625, + "step": 208 + }, + { + "completion_length": 1035.166748046875, + "epoch": 0.7307692307692307, + "grad_norm": 0.45489755272865295, + "kl": 0.03647574782371521, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.0015, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 209 + }, + { + "completion_length": 561.5, + "epoch": 0.7342657342657343, + "grad_norm": 0.7732179164886475, + "kl": 0.10910838097333908, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0044, + "reward": 3.075000286102295, + "reward_std": 0.9852665662765503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416667342185974, + "step": 210 + }, + { + "completion_length": 327.3333435058594, + "epoch": 0.7377622377622378, + "grad_norm": 1.1959446668624878, + "kl": 0.18659886717796326, + "learning_rate": 4.999539280304111e-06, + "loss": 0.0075, + "reward": 1.7333333492279053, + "reward_std": 0.6875075697898865, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 211 + }, + { + "completion_length": 698.1666870117188, + "epoch": 0.7412587412587412, + "grad_norm": 0.5885636806488037, + "kl": 0.06670037657022476, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0027, + "reward": 2.7750003337860107, + "reward_std": 0.8341163396835327, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7749999761581421, + "step": 212 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7447552447552448, + "grad_norm": 0.9122396111488342, + "kl": 0.10316199064254761, + "learning_rate": 4.999356523306746e-06, + "loss": 0.0041, + "reward": 2.008333444595337, + "reward_std": 1.2973692417144775, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5083333253860474, + "step": 213 + }, + { + "completion_length": 604.1666870117188, + "epoch": 0.7482517482517482, + "grad_norm": 0.7414869070053101, + "kl": 0.08340045064687729, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.0033, + "reward": 3.299999952316284, + "reward_std": 0.41713306307792664, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8000000715255737, + "step": 214 + }, + { + "completion_length": 704.5, + "epoch": 0.7517482517482518, + "grad_norm": 2.09073543548584, + "kl": 0.10594753921031952, + "learning_rate": 4.999143312438893e-06, + "loss": 0.0042, + "reward": 1.7416666746139526, + "reward_std": 0.9259679317474365, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7416667342185974, + "step": 215 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.7552447552447552, + "grad_norm": 1.304240107536316, + "kl": 0.1295248121023178, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0052, + "reward": 2.616666793823242, + "reward_std": 1.6061341762542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6166666746139526, + "step": 216 + }, + { + "completion_length": 495.8333435058594, + "epoch": 0.7587412587412588, + "grad_norm": 1.2090598344802856, + "kl": 0.11880560964345932, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.0048, + "reward": 2.7333333492279053, + "reward_std": 1.022578477859497, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5666667222976685, + "step": 217 + }, + { + "completion_length": 565.6666870117188, + "epoch": 0.7622377622377622, + "grad_norm": 0.553954005241394, + "kl": 0.052788302302360535, + "learning_rate": 4.998766400914329e-06, + "loss": 0.0021, + "reward": 2.6999998092651367, + "reward_std": 0.9705669283866882, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.699999988079071, + "step": 218 + }, + { + "completion_length": 645.6666870117188, + "epoch": 0.7657342657342657, + "grad_norm": 2.507683038711548, + "kl": 0.2849184274673462, + "learning_rate": 4.998625539854394e-06, + "loss": 0.0114, + "reward": 2.6000001430511475, + "reward_std": 1.0089600086212158, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 219 + }, + { + "completion_length": 321.66668701171875, + "epoch": 0.7692307692307693, + "grad_norm": 1.2175945043563843, + "kl": 0.0842239186167717, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0034, + "reward": 2.933333158493042, + "reward_std": 0.6516644954681396, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6000000238418579, + "step": 220 + }, + { + "completion_length": 700.5, + "epoch": 0.7727272727272727, + "grad_norm": 2.048892021179199, + "kl": 0.16157689690589905, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 221 + }, + { + "completion_length": 833.5, + "epoch": 0.7762237762237763, + "grad_norm": 0.9171572327613831, + "kl": 0.06645169854164124, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0027, + "reward": 2.9083335399627686, + "reward_std": 1.2068209648132324, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 222 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.7797202797202797, + "grad_norm": 19.220211029052734, + "kl": 3.192702293395996, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.1277, + "reward": 3.191666603088379, + "reward_std": 1.2146673202514648, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.6916667222976685, + "step": 223 + }, + { + "completion_length": 593.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.8852243423461914, + "kl": 0.09442658722400665, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0038, + "reward": 3.2750003337860107, + "reward_std": 0.6691412925720215, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 224 + }, + { + "completion_length": 831.1666870117188, + "epoch": 0.7867132867132867, + "grad_norm": 0.4429211914539337, + "kl": 0.04310205578804016, + "learning_rate": 4.997620553954645e-06, + "loss": 0.0017, + "reward": 3.1541666984558105, + "reward_std": 1.132741928100586, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8208333849906921, + "step": 225 + }, + { + "completion_length": 731.0, + "epoch": 0.7902097902097902, + "grad_norm": 0.4210525155067444, + "kl": 0.0507250651717186, + "learning_rate": 4.997426424476787e-06, + "loss": 0.002, + "reward": 3.758333206176758, + "reward_std": 0.40052053332328796, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 226 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.7937062937062938, + "grad_norm": 1.443489670753479, + "kl": 0.1432674527168274, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.0057, + "reward": 2.7166666984558105, + "reward_std": 1.075019359588623, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 227 + }, + { + "completion_length": 749.0, + "epoch": 0.7972027972027972, + "grad_norm": 0.4731828272342682, + "kl": 0.05084119364619255, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.002, + "reward": 2.5250000953674316, + "reward_std": 0.49371039867401123, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8583332896232605, + "step": 228 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.8006993006993007, + "grad_norm": 1.1463042497634888, + "kl": 0.0917380303144455, + "learning_rate": 4.996798392960466e-06, + "loss": 0.0037, + "reward": 3.1000001430511475, + "reward_std": 1.1304867267608643, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 229 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.8041958041958042, + "grad_norm": 2.1588308811187744, + "kl": 0.2637466788291931, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0105, + "reward": 1.4583333730697632, + "reward_std": 0.665895402431488, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 230 + }, + { + "completion_length": 563.8333740234375, + "epoch": 0.8076923076923077, + "grad_norm": 1.7064660787582397, + "kl": 0.15527644753456116, + "learning_rate": 4.99634167581553e-06, + "loss": 0.0062, + "reward": 2.9208335876464844, + "reward_std": 1.1095513105392456, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5874999761581421, + "step": 231 + }, + { + "completion_length": 571.6666870117188, + "epoch": 0.8111888111888111, + "grad_norm": 0.7909032106399536, + "kl": 0.10144728422164917, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0041, + "reward": 3.200000286102295, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.699999988079071, + "step": 232 + }, + { + "completion_length": 442.16668701171875, + "epoch": 0.8146853146853147, + "grad_norm": 2.3640758991241455, + "kl": 0.1561039686203003, + "learning_rate": 4.995854541535072e-06, + "loss": 0.0062, + "reward": 2.8583333492279053, + "reward_std": 1.5499732494354248, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 233 + }, + { + "completion_length": 635.0, + "epoch": 0.8181818181818182, + "grad_norm": 1.519736409187317, + "kl": 0.08059443533420563, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0032, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 234 + }, + { + "completion_length": 867.1666870117188, + "epoch": 0.8216783216783217, + "grad_norm": 1.0411657094955444, + "kl": 0.18848155438899994, + "learning_rate": 4.995336996054668e-06, + "loss": 0.0075, + "reward": 2.566666603088379, + "reward_std": 0.8010410666465759, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 235 + }, + { + "completion_length": 767.0, + "epoch": 0.8251748251748252, + "grad_norm": 1.3162877559661865, + "kl": 0.1943603754043579, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0078, + "reward": 2.8458335399627686, + "reward_std": 1.271457552909851, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8458333015441895, + "step": 236 + }, + { + "completion_length": 971.0, + "epoch": 0.8286713286713286, + "grad_norm": 0.7847824096679688, + "kl": 0.07626049965620041, + "learning_rate": 4.994789045680448e-06, + "loss": 0.0031, + "reward": 2.766666889190674, + "reward_std": 1.1245739459991455, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7666666507720947, + "step": 237 + }, + { + "completion_length": 552.0, + "epoch": 0.8321678321678322, + "grad_norm": 0.7410560250282288, + "kl": 0.10457824170589447, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0042, + "reward": 3.391666889190674, + "reward_std": 0.7059863805770874, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7250000238418579, + "step": 238 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.8356643356643356, + "grad_norm": 0.4836815595626831, + "kl": 0.05600851774215698, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.0022, + "reward": 2.7333333492279053, + "reward_std": 0.40207791328430176, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8999999761581421, + "step": 239 + }, + { + "completion_length": 670.1666870117188, + "epoch": 0.8391608391608392, + "grad_norm": 1.1572860479354858, + "kl": 0.09645780920982361, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0039, + "reward": 1.945833444595337, + "reward_std": 1.1002748012542725, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.612500011920929, + "step": 240 + }, + { + "completion_length": 716.0, + "epoch": 0.8426573426573427, + "grad_norm": 0.6385201811790466, + "kl": 0.10877624154090881, + "learning_rate": 4.993601957327361e-06, + "loss": 0.0044, + "reward": 1.7999999523162842, + "reward_std": 1.3168143033981323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 241 + }, + { + "completion_length": 783.0, + "epoch": 0.8461538461538461, + "grad_norm": 0.4785465598106384, + "kl": 0.06399235874414444, + "learning_rate": 4.993286193061145e-06, + "loss": 0.0026, + "reward": 2.258333444595337, + "reward_std": 0.5389031767845154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 242 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.8496503496503497, + "grad_norm": 0.7678278684616089, + "kl": 0.07323874533176422, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.0029, + "reward": 2.575000047683716, + "reward_std": 1.0048632621765137, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 243 + }, + { + "completion_length": 904.5, + "epoch": 0.8531468531468531, + "grad_norm": 0.41908255219459534, + "kl": 0.049275174736976624, + "learning_rate": 4.992631880567301e-06, + "loss": 0.002, + "reward": 1.9250000715255737, + "reward_std": 0.6354132890701294, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.9249999523162842, + "step": 244 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.8566433566433567, + "grad_norm": 0.9670363068580627, + "kl": 0.17363564670085907, + "learning_rate": 4.992293334332821e-06, + "loss": 0.0069, + "reward": 1.558333396911621, + "reward_std": 1.3331979513168335, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333373069763, + "step": 245 + }, + { + "completion_length": 869.1666870117188, + "epoch": 0.8601398601398601, + "grad_norm": 0.45620983839035034, + "kl": 0.0668826699256897, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0027, + "reward": 2.5416667461395264, + "reward_std": 0.9057685732841492, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 246 + }, + { + "completion_length": 841.3333740234375, + "epoch": 0.8636363636363636, + "grad_norm": 0.559363603591919, + "kl": 0.0583985298871994, + "learning_rate": 4.991593467045092e-06, + "loss": 0.0023, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 247 + }, + { + "completion_length": 599.1666870117188, + "epoch": 0.8671328671328671, + "grad_norm": 0.9642091989517212, + "kl": 0.11994724720716476, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0048, + "reward": 2.5250000953674316, + "reward_std": 1.0810874700546265, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 248 + }, + { + "completion_length": 462.16668701171875, + "epoch": 0.8706293706293706, + "grad_norm": 36.93287658691406, + "kl": 9.688800811767578, + "learning_rate": 4.990863240477266e-06, + "loss": 0.3876, + "reward": 2.133333444595337, + "reward_std": 1.5154757499694824, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.46666666865348816, + "step": 249 + }, + { + "completion_length": 339.0, + "epoch": 0.8741258741258742, + "grad_norm": 26.625389099121094, + "kl": 0.959087610244751, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0384, + "reward": 2.4000000953674316, + "reward_std": 1.4926488399505615, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5666666626930237, + "step": 250 + }, + { + "completion_length": 618.1666870117188, + "epoch": 0.8776223776223776, + "grad_norm": 0.8756181597709656, + "kl": 0.1540575623512268, + "learning_rate": 4.990102663526925e-06, + "loss": 0.0062, + "reward": 2.3583335876464844, + "reward_std": 0.7564169764518738, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 251 + }, + { + "completion_length": 659.0, + "epoch": 0.8811188811188811, + "grad_norm": 1.4729007482528687, + "kl": 0.22244331240653992, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0089, + "reward": 2.6666667461395264, + "reward_std": 1.386602759361267, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 471.0, + "epoch": 0.8846153846153846, + "grad_norm": 1.7183626890182495, + "kl": 0.19531545042991638, + "learning_rate": 4.989311745461456e-06, + "loss": 0.0078, + "reward": 2.2624998092651367, + "reward_std": 1.547720193862915, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.42916664481163025, + "step": 253 + }, + { + "completion_length": 809.5, + "epoch": 0.8881118881118881, + "grad_norm": 1.3393943309783936, + "kl": 0.06276177614927292, + "learning_rate": 4.9889049115077e-06, + "loss": 0.0025, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 696.0, + "epoch": 0.8916083916083916, + "grad_norm": 0.5159295201301575, + "kl": 0.06829811632633209, + "learning_rate": 4.988490495917948e-06, + "loss": 0.0027, + "reward": 2.375, + "reward_std": 0.8226482272148132, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 255 + }, + { + "completion_length": 469.8333435058594, + "epoch": 0.8951048951048951, + "grad_norm": 15.731892585754395, + "kl": 5.195942401885986, + "learning_rate": 4.988068499954578e-06, + "loss": 0.2078, + "reward": 2.5333333015441895, + "reward_std": 1.7218208312988281, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 256 + }, + { + "completion_length": 267.66668701171875, + "epoch": 0.8986013986013986, + "grad_norm": 2.6494510173797607, + "kl": 0.2645886242389679, + "learning_rate": 4.987638924903066e-06, + "loss": 0.0106, + "reward": 1.9833333492279053, + "reward_std": 1.6277797222137451, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4833333194255829, + "step": 257 + }, + { + "completion_length": 772.3333740234375, + "epoch": 0.9020979020979021, + "grad_norm": 0.4527927339076996, + "kl": 0.06693247705698013, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0027, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 258 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.9055944055944056, + "grad_norm": 0.689224362373352, + "kl": 0.08530323952436447, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.0034, + "reward": 0.7916666865348816, + "reward_std": 0.9183772802352905, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.4583333432674408, + "step": 259 + }, + { + "completion_length": 537.1666870117188, + "epoch": 0.9090909090909091, + "grad_norm": 0.6728858947753906, + "kl": 0.0897747129201889, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0036, + "reward": 3.129167079925537, + "reward_std": 1.1996268033981323, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7958333492279053, + "step": 260 + }, + { + "completion_length": 407.8333435058594, + "epoch": 0.9125874125874126, + "grad_norm": 1.1994887590408325, + "kl": 0.09183052182197571, + "learning_rate": 4.985844860333012e-06, + "loss": 0.0037, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 261 + }, + { + "completion_length": 677.5, + "epoch": 0.916083916083916, + "grad_norm": 0.508855402469635, + "kl": 0.07326661795377731, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0029, + "reward": 3.375, + "reward_std": 0.8635681867599487, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 262 + }, + { + "completion_length": 736.8333740234375, + "epoch": 0.9195804195804196, + "grad_norm": 0.9614912271499634, + "kl": 0.09196578711271286, + "learning_rate": 4.98490238863795e-06, + "loss": 0.0037, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 263 + }, + { + "completion_length": 770.8333740234375, + "epoch": 0.9230769230769231, + "grad_norm": 0.47455278038978577, + "kl": 0.06785900890827179, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0027, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 264 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.9265734265734266, + "grad_norm": 0.5573136210441589, + "kl": 0.08627455681562424, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.0035, + "reward": 3.116666793823242, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 265 + }, + { + "completion_length": 391.3333435058594, + "epoch": 0.9300699300699301, + "grad_norm": 1.9462356567382812, + "kl": 0.16661277413368225, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0067, + "reward": 2.4749999046325684, + "reward_std": 1.4665435552597046, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 266 + }, + { + "completion_length": 397.3333435058594, + "epoch": 0.9335664335664335, + "grad_norm": 1.011677622795105, + "kl": 0.23764805495738983, + "learning_rate": 4.982926623846216e-06, + "loss": 0.0095, + "reward": 3.366666793823242, + "reward_std": 0.6274287104606628, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7000000476837158, + "step": 267 + }, + { + "completion_length": 417.0, + "epoch": 0.9370629370629371, + "grad_norm": 1.4490914344787598, + "kl": 0.13754335045814514, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0055, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 268 + }, + { + "completion_length": 410.5, + "epoch": 0.9405594405594405, + "grad_norm": 0.8436146974563599, + "kl": 0.14260268211364746, + "learning_rate": 4.981893354823614e-06, + "loss": 0.0057, + "reward": 1.8125, + "reward_std": 1.1806514263153076, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6458333730697632, + "step": 269 + }, + { + "completion_length": 644.6666870117188, + "epoch": 0.9440559440559441, + "grad_norm": 0.7549885511398315, + "kl": 0.09023593366146088, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0036, + "reward": 2.3500001430511475, + "reward_std": 1.3856406211853027, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 270 + }, + { + "completion_length": 195.5, + "epoch": 0.9475524475524476, + "grad_norm": 1.895914077758789, + "kl": 0.29670989513397217, + "learning_rate": 4.980829844713722e-06, + "loss": 0.0119, + "reward": 1.649999976158142, + "reward_std": 1.0168579816818237, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.3166666626930237, + "step": 271 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.951048951048951, + "grad_norm": 1.0856112241744995, + "kl": 0.255443274974823, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0102, + "reward": 2.2916667461395264, + "reward_std": 1.2310227155685425, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.625, + "step": 272 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.9545454545454546, + "grad_norm": 0.2943981885910034, + "kl": 0.12990406155586243, + "learning_rate": 4.979736106475075e-06, + "loss": 0.0064, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 273 + }, + { + "completion_length": 680.0, + "epoch": 0.958041958041958, + "grad_norm": 0.5072641372680664, + "kl": 0.07472037523984909, + "learning_rate": 4.979177905957726e-06, + "loss": 0.003, + "reward": 3.012500286102295, + "reward_std": 1.1379531621932983, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 274 + }, + { + "completion_length": 491.5, + "epoch": 0.9615384615384616, + "grad_norm": 0.6770206689834595, + "kl": 0.13075995445251465, + "learning_rate": 4.978612153434527e-06, + "loss": 0.0052, + "reward": 2.008333444595337, + "reward_std": 0.7618508338928223, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6750000715255737, + "step": 275 + }, + { + "completion_length": 749.6666870117188, + "epoch": 0.965034965034965, + "grad_norm": 0.5412439107894897, + "kl": 0.10561086982488632, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0042, + "reward": 2.870833396911621, + "reward_std": 0.6615166068077087, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 276 + }, + { + "completion_length": 511.5, + "epoch": 0.9685314685314685, + "grad_norm": 1.1368520259857178, + "kl": 0.14474637806415558, + "learning_rate": 4.977457999287091e-06, + "loss": 0.0058, + "reward": 1.7583332061767578, + "reward_std": 1.0646204948425293, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 277 + }, + { + "completion_length": 750.6666870117188, + "epoch": 0.972027972027972, + "grad_norm": 1.0957084894180298, + "kl": 0.10108073800802231, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.004, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 278 + }, + { + "completion_length": 324.3333435058594, + "epoch": 0.9755244755244755, + "grad_norm": 1.0172570943832397, + "kl": 0.31204575300216675, + "learning_rate": 4.976273658095772e-06, + "loss": 0.0125, + "reward": 0.908333420753479, + "reward_std": 1.0532886981964111, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40833330154418945, + "step": 279 + }, + { + "completion_length": 329.66668701171875, + "epoch": 0.9790209790209791, + "grad_norm": 0.753690242767334, + "kl": 0.09907300770282745, + "learning_rate": 4.975670171853926e-06, + "loss": 0.004, + "reward": 2.7750003337860107, + "reward_std": 1.0994317531585693, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 280 + }, + { + "completion_length": 615.3333740234375, + "epoch": 0.9825174825174825, + "grad_norm": 0.8215593695640564, + "kl": 0.09376661479473114, + "learning_rate": 4.975059144291395e-06, + "loss": 0.0038, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8749999403953552, + "step": 281 + }, + { + "completion_length": 435.8333435058594, + "epoch": 0.986013986013986, + "grad_norm": 1.3309355974197388, + "kl": 0.21346941590309143, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0085, + "reward": 2.0333333015441895, + "reward_std": 1.6485350131988525, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 282 + }, + { + "completion_length": 470.3333435058594, + "epoch": 0.9895104895104895, + "grad_norm": 1.1230376958847046, + "kl": 0.1047142893075943, + "learning_rate": 4.973814472672424e-06, + "loss": 0.0042, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 283 + }, + { + "completion_length": 887.5, + "epoch": 0.993006993006993, + "grad_norm": 0.6477030515670776, + "kl": 0.08142790198326111, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0033, + "reward": 1.4250000715255737, + "reward_std": 0.9661781191825867, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.5916666388511658, + "step": 284 + }, + { + "completion_length": 566.3333740234375, + "epoch": 0.9965034965034965, + "grad_norm": 0.7089259624481201, + "kl": 0.1486695259809494, + "learning_rate": 4.972539658404793e-06, + "loss": 0.0059, + "reward": 1.7166666984558105, + "reward_std": 0.7332576513290405, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7166666388511658, + "step": 285 + }, + { + "completion_length": 899.3333740234375, + "epoch": 1.0, + "grad_norm": 0.6575971841812134, + "kl": 0.0989997610449791, + "learning_rate": 4.971890952617515e-06, + "loss": 0.004, + "reward": 2.8583335876464844, + "reward_std": 0.9960757493972778, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 286 + }, + { + "completion_length": 414.5, + "epoch": 1.0034965034965035, + "grad_norm": 1.0364247560501099, + "kl": 0.19011634588241577, + "learning_rate": 4.971234717021709e-06, + "loss": 0.0076, + "reward": 1.7916667461395264, + "reward_std": 1.7468304634094238, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6250000596046448, + "step": 287 + }, + { + "completion_length": 524.0, + "epoch": 1.006993006993007, + "grad_norm": 0.9833644032478333, + "kl": 0.14835724234580994, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0059, + "reward": 2.3583335876464844, + "reward_std": 1.1191142797470093, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6916666030883789, + "step": 288 + }, + { + "completion_length": 681.1666870117188, + "epoch": 1.0104895104895104, + "grad_norm": 0.6175888180732727, + "kl": 0.10941031575202942, + "learning_rate": 4.969899664423473e-06, + "loss": 0.0044, + "reward": 2.704166889190674, + "reward_std": 0.7567061185836792, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8708333373069763, + "step": 289 + }, + { + "completion_length": 386.5, + "epoch": 1.013986013986014, + "grad_norm": 2.7495882511138916, + "kl": 0.5513795614242554, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0221, + "reward": 1.3666666746139526, + "reward_std": 1.0023306608200073, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5333333611488342, + "step": 290 + }, + { + "completion_length": 679.6666870117188, + "epoch": 1.0174825174825175, + "grad_norm": 0.9174596667289734, + "kl": 0.14350205659866333, + "learning_rate": 4.968534516877279e-06, + "loss": 0.0057, + "reward": 2.879167079925537, + "reward_std": 1.0047906637191772, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7124999761581421, + "step": 291 + }, + { + "completion_length": 322.0, + "epoch": 1.020979020979021, + "grad_norm": 6.856034278869629, + "kl": 3.479478597640991, + "learning_rate": 4.96784066268247e-06, + "loss": 0.1392, + "reward": 0.875, + "reward_std": 0.9832345247268677, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 292 + }, + { + "completion_length": 500.5, + "epoch": 1.0244755244755244, + "grad_norm": 0.8394511938095093, + "kl": 0.14955884218215942, + "learning_rate": 4.967139291017018e-06, + "loss": 0.006, + "reward": 2.133333206176758, + "reward_std": 1.149202585220337, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 293 + }, + { + "completion_length": 470.5, + "epoch": 1.027972027972028, + "grad_norm": 1.0547795295715332, + "kl": 0.26865124702453613, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0107, + "reward": 1.7916667461395264, + "reward_std": 1.1534368991851807, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 294 + }, + { + "completion_length": 357.3333435058594, + "epoch": 1.0314685314685315, + "grad_norm": 1.61123788356781, + "kl": 0.2728823125362396, + "learning_rate": 4.965714003843079e-06, + "loss": 0.0109, + "reward": 3.266666889190674, + "reward_std": 1.6014575958251953, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7666666507720947, + "step": 295 + }, + { + "completion_length": 388.3333435058594, + "epoch": 1.034965034965035, + "grad_norm": 0.8229731917381287, + "kl": 0.33708059787750244, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0135, + "reward": 0.7125000357627869, + "reward_std": 0.5300353765487671, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.3791666626930237, + "step": 296 + }, + { + "completion_length": 667.0, + "epoch": 1.0384615384615385, + "grad_norm": 1.0831242799758911, + "kl": 0.26999422907829285, + "learning_rate": 4.964258672722135e-06, + "loss": 0.0108, + "reward": 2.5458335876464844, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7124999761581421, + "step": 297 + }, + { + "completion_length": 804.1666870117188, + "epoch": 1.0419580419580419, + "grad_norm": 0.625715434551239, + "kl": 0.12136679887771606, + "learning_rate": 4.963519746208726e-06, + "loss": 0.0049, + "reward": 1.5791667699813843, + "reward_std": 1.2249915599822998, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7458333373069763, + "step": 298 + }, + { + "completion_length": 615.3333740234375, + "epoch": 1.0454545454545454, + "grad_norm": 0.9705678820610046, + "kl": 0.2214520424604416, + "learning_rate": 4.962773315386935e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355836629867554, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 299 + }, + { + "completion_length": 836.1666870117188, + "epoch": 1.048951048951049, + "grad_norm": 1.5465428829193115, + "kl": 0.24709966778755188, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0099, + "reward": 2.0458333492279053, + "reward_std": 1.097544550895691, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 300 + }, + { + "completion_length": 597.6666870117188, + "epoch": 1.0524475524475525, + "grad_norm": 3.8257570266723633, + "kl": 0.9686455130577087, + "learning_rate": 4.961257949936092e-06, + "loss": 0.0387, + "reward": 1.4750001430511475, + "reward_std": 1.025061011314392, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4749999940395355, + "step": 301 + }, + { + "completion_length": 516.6666870117188, + "epoch": 1.055944055944056, + "grad_norm": 2.1578736305236816, + "kl": 0.25257474184036255, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0101, + "reward": 1.712499976158142, + "reward_std": 1.2360976934432983, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 302 + }, + { + "completion_length": 390.3333435058594, + "epoch": 1.0594405594405594, + "grad_norm": 1.1851695775985718, + "kl": 0.30646514892578125, + "learning_rate": 4.959712594833855e-06, + "loss": 0.0123, + "reward": 1.3875000476837158, + "reward_std": 1.3440377712249756, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5541666746139526, + "step": 303 + }, + { + "completion_length": 329.66668701171875, + "epoch": 1.062937062937063, + "grad_norm": 1.7874314785003662, + "kl": 0.5978689193725586, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0239, + "reward": 2.5625, + "reward_std": 1.447562575340271, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 304 + }, + { + "completion_length": 676.5, + "epoch": 1.0664335664335665, + "grad_norm": 1.6353819370269775, + "kl": 0.2865048348903656, + "learning_rate": 4.958137268909887e-06, + "loss": 0.0115, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 305 + }, + { + "completion_length": 685.1666870117188, + "epoch": 1.06993006993007, + "grad_norm": 0.5405178666114807, + "kl": 0.16403402388095856, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0066, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 306 + }, + { + "completion_length": 377.16668701171875, + "epoch": 1.0734265734265733, + "grad_norm": 1.3861095905303955, + "kl": 0.5912900567054749, + "learning_rate": 4.956531991359038e-06, + "loss": 0.0237, + "reward": 0.9541667699813843, + "reward_std": 0.9423928260803223, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4541666507720947, + "step": 307 + }, + { + "completion_length": 568.1666870117188, + "epoch": 1.0769230769230769, + "grad_norm": 2.0841739177703857, + "kl": 0.3946326673030853, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0158, + "reward": 1.2583333253860474, + "reward_std": 1.1876096725463867, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666388511658, + "step": 308 + }, + { + "completion_length": 610.1666870117188, + "epoch": 1.0804195804195804, + "grad_norm": 0.7838713526725769, + "kl": 0.20940952003002167, + "learning_rate": 4.95489678174111e-06, + "loss": 0.0084, + "reward": 1.1750000715255737, + "reward_std": 1.1035170555114746, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6750000715255737, + "step": 309 + }, + { + "completion_length": 780.3333740234375, + "epoch": 1.083916083916084, + "grad_norm": 0.91953444480896, + "kl": 0.13563194870948792, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0054, + "reward": 1.8500001430511475, + "reward_std": 1.006479024887085, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 310 + }, + { + "completion_length": 468.66668701171875, + "epoch": 1.0874125874125875, + "grad_norm": 1.1062681674957275, + "kl": 0.36474311351776123, + "learning_rate": 4.953231659980613e-06, + "loss": 0.0146, + "reward": 2.058333396911621, + "reward_std": 1.7576736211776733, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 311 + }, + { + "completion_length": 571.3333740234375, + "epoch": 1.0909090909090908, + "grad_norm": 0.7562583088874817, + "kl": 0.17403468489646912, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.007, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 312 + }, + { + "completion_length": 580.6666870117188, + "epoch": 1.0944055944055944, + "grad_norm": 0.7236371040344238, + "kl": 0.20542237162590027, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.0082, + "reward": 2.4000000953674316, + "reward_std": 0.8803409337997437, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8999999761581421, + "step": 313 + }, + { + "completion_length": 372.5, + "epoch": 1.097902097902098, + "grad_norm": 0.736242949962616, + "kl": 0.19798314571380615, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0079, + "reward": 3.0916666984558105, + "reward_std": 0.4247548282146454, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 314 + }, + { + "completion_length": 660.8333740234375, + "epoch": 1.1013986013986015, + "grad_norm": 0.7641960978507996, + "kl": 0.29524654150009155, + "learning_rate": 4.949811761552074e-06, + "loss": 0.0118, + "reward": 2.4166669845581055, + "reward_std": 1.2176480293273926, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7499999403953552, + "step": 315 + }, + { + "completion_length": 838.3333740234375, + "epoch": 1.104895104895105, + "grad_norm": 0.5717921853065491, + "kl": 0.14558419585227966, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0058, + "reward": 2.258333206176758, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9249999523162842, + "step": 316 + }, + { + "completion_length": 308.8333435058594, + "epoch": 1.1083916083916083, + "grad_norm": 1.5407124757766724, + "kl": 0.36382099986076355, + "learning_rate": 4.948057026554415e-06, + "loss": 0.0146, + "reward": 1.2291667461395264, + "reward_std": 1.2054479122161865, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5625, + "step": 317 + }, + { + "completion_length": 582.1666870117188, + "epoch": 1.1118881118881119, + "grad_norm": 0.5300387144088745, + "kl": 0.19406351447105408, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0078, + "reward": 1.375, + "reward_std": 0.4937104880809784, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.8749999403953552, + "step": 318 + }, + { + "completion_length": 889.3333740234375, + "epoch": 1.1153846153846154, + "grad_norm": 0.7921298146247864, + "kl": 0.14385448396205902, + "learning_rate": 4.946272462754447e-06, + "loss": 0.0058, + "reward": 1.629166603088379, + "reward_std": 0.8614546656608582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 319 + }, + { + "completion_length": 576.6666870117188, + "epoch": 1.118881118881119, + "grad_norm": 2.1564207077026367, + "kl": 0.8259252309799194, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.033, + "reward": 1.399999976158142, + "reward_std": 1.3337916135787964, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.40000003576278687, + "step": 320 + }, + { + "completion_length": 471.8333435058594, + "epoch": 1.1223776223776223, + "grad_norm": 1.2515596151351929, + "kl": 0.24163812398910522, + "learning_rate": 4.944458091896515e-06, + "loss": 0.0097, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7166666984558105, + "step": 321 + }, + { + "completion_length": 416.66668701171875, + "epoch": 1.1258741258741258, + "grad_norm": 0.7721207141876221, + "kl": 0.2213769555091858, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.0089, + "reward": 1.899999976158142, + "reward_std": 0.6442049741744995, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 322 + }, + { + "completion_length": 1349.5, + "epoch": 1.1293706293706294, + "grad_norm": 0.3130567967891693, + "kl": 0.10197386145591736, + "learning_rate": 4.94261393608816e-06, + "loss": 0.0041, + "reward": 1.9666666984558105, + "reward_std": 0.9277212023735046, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7999999523162842, + "step": 323 + }, + { + "completion_length": 669.5, + "epoch": 1.132867132867133, + "grad_norm": 0.9291994571685791, + "kl": 0.22598087787628174, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.009, + "reward": 0.949999988079071, + "reward_std": 0.6595453023910522, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6166666746139526, + "step": 324 + }, + { + "completion_length": 184.1666717529297, + "epoch": 1.1363636363636362, + "grad_norm": 2.9357590675354004, + "kl": 0.44805118441581726, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.0179, + "reward": 2.450000047683716, + "reward_std": 1.4673106670379639, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 325 + }, + { + "completion_length": 786.8333740234375, + "epoch": 1.1398601398601398, + "grad_norm": 0.7112540006637573, + "kl": 0.23709163069725037, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0095, + "reward": 2.7333335876464844, + "reward_std": 0.40207797288894653, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 326 + }, + { + "completion_length": 391.16668701171875, + "epoch": 1.1433566433566433, + "grad_norm": 1.6311299800872803, + "kl": 0.31598275899887085, + "learning_rate": 4.938836359864641e-06, + "loss": 0.0126, + "reward": 2.2791666984558105, + "reward_std": 0.9937827587127686, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6124999523162842, + "step": 327 + }, + { + "completion_length": 325.8333435058594, + "epoch": 1.1468531468531469, + "grad_norm": 1.6858141422271729, + "kl": 0.40026235580444336, + "learning_rate": 4.937873385763909e-06, + "loss": 0.016, + "reward": 2.0250000953674316, + "reward_std": 1.1339092254638672, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 328 + }, + { + "completion_length": 313.3333435058594, + "epoch": 1.1503496503496504, + "grad_norm": 1.9852374792099, + "kl": 0.36842843890190125, + "learning_rate": 4.936902985478055e-06, + "loss": 0.0147, + "reward": 2.5250000953674316, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 329 + }, + { + "completion_length": 333.66668701171875, + "epoch": 1.1538461538461537, + "grad_norm": 1.0456072092056274, + "kl": 0.3002980351448059, + "learning_rate": 4.935925161963089e-06, + "loss": 0.012, + "reward": 2.1083335876464844, + "reward_std": 0.9068719744682312, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7749999761581421, + "step": 330 + }, + { + "completion_length": 419.16668701171875, + "epoch": 1.1573426573426573, + "grad_norm": 0.9209095239639282, + "kl": 0.19463126361370087, + "learning_rate": 4.93493991819763e-06, + "loss": 0.0078, + "reward": 3.566666603088379, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 331 + }, + { + "completion_length": 501.3333435058594, + "epoch": 1.1608391608391608, + "grad_norm": 0.9894822239875793, + "kl": 0.23653444647789001, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0095, + "reward": 2.4583334922790527, + "reward_std": 1.6280101537704468, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 332 + }, + { + "completion_length": 283.8333435058594, + "epoch": 1.1643356643356644, + "grad_norm": 1.3056206703186035, + "kl": 0.3558562397956848, + "learning_rate": 4.932947181942721e-06, + "loss": 0.0142, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 333 + }, + { + "completion_length": 617.8333740234375, + "epoch": 1.167832167832168, + "grad_norm": 0.7905691266059875, + "kl": 0.2221965491771698, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 334 + }, + { + "completion_length": 802.3333740234375, + "epoch": 1.1713286713286712, + "grad_norm": 0.650930643081665, + "kl": 0.2902371287345886, + "learning_rate": 4.930924800994192e-06, + "loss": 0.0116, + "reward": 2.9375, + "reward_std": 0.9523326754570007, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 335 + }, + { + "completion_length": 571.5, + "epoch": 1.1748251748251748, + "grad_norm": 2.592233180999756, + "kl": 0.44388240575790405, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0178, + "reward": 2.625, + "reward_std": 1.0167349576950073, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 336 + }, + { + "completion_length": 765.0, + "epoch": 1.1783216783216783, + "grad_norm": 0.8478806018829346, + "kl": 0.23496964573860168, + "learning_rate": 4.928872799994116e-06, + "loss": 0.0094, + "reward": 2.4166665077209473, + "reward_std": 1.0943796634674072, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.75, + "step": 337 + }, + { + "completion_length": 369.5, + "epoch": 1.1818181818181819, + "grad_norm": 1.2003388404846191, + "kl": 0.283313125371933, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0113, + "reward": 2.4625000953674316, + "reward_std": 1.1056389808654785, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7958333492279053, + "step": 338 + }, + { + "completion_length": 241.1666717529297, + "epoch": 1.1853146853146854, + "grad_norm": 1.1362509727478027, + "kl": 0.36542683839797974, + "learning_rate": 4.926791203945477e-06, + "loss": 0.0146, + "reward": 2.941667079925537, + "reward_std": 1.237908124923706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7749999761581421, + "step": 339 + }, + { + "completion_length": 262.3333435058594, + "epoch": 1.1888111888111887, + "grad_norm": 2.5425589084625244, + "kl": 0.46542689204216003, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0186, + "reward": 2.2166666984558105, + "reward_std": 1.3840761184692383, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666388511658, + "step": 340 + }, + { + "completion_length": 458.8333435058594, + "epoch": 1.1923076923076923, + "grad_norm": 1.0685269832611084, + "kl": 0.28533288836479187, + "learning_rate": 4.924680038211868e-06, + "loss": 0.0114, + "reward": 3.0375001430511475, + "reward_std": 0.7974568605422974, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 341 + }, + { + "completion_length": 680.6666870117188, + "epoch": 1.1958041958041958, + "grad_norm": 1.049636960029602, + "kl": 0.2565695643424988, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0103, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 342 + }, + { + "completion_length": 669.5, + "epoch": 1.1993006993006994, + "grad_norm": 0.47562330961227417, + "kl": 0.15911276638507843, + "learning_rate": 4.922539328517174e-06, + "loss": 0.0064, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 343 + }, + { + "completion_length": 533.1666870117188, + "epoch": 1.2027972027972027, + "grad_norm": 2.7278823852539062, + "kl": 0.42878812551498413, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0172, + "reward": 2.191666603088379, + "reward_std": 1.1499637365341187, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916667222976685, + "step": 344 + }, + { + "completion_length": 410.5, + "epoch": 1.2062937062937062, + "grad_norm": 1.2009421586990356, + "kl": 0.30361247062683105, + "learning_rate": 4.92036910094527e-06, + "loss": 0.0121, + "reward": 2.2958333492279053, + "reward_std": 0.7362772822380066, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7958333492279053, + "step": 345 + }, + { + "completion_length": 678.0, + "epoch": 1.2097902097902098, + "grad_norm": 1.1339452266693115, + "kl": 0.36994367837905884, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0148, + "reward": 1.375, + "reward_std": 1.7195203304290771, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.375, + "step": 346 + }, + { + "completion_length": 364.66668701171875, + "epoch": 1.2132867132867133, + "grad_norm": 1.0105022192001343, + "kl": 0.22824347019195557, + "learning_rate": 4.918169381939693e-06, + "loss": 0.0091, + "reward": 1.75, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 347 + }, + { + "completion_length": 231.83334350585938, + "epoch": 1.2167832167832167, + "grad_norm": 2.2665371894836426, + "kl": 0.5012367963790894, + "learning_rate": 4.917058471511149e-06, + "loss": 0.02, + "reward": 0.8916667699813843, + "reward_std": 0.8929818868637085, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5583333969116211, + "step": 348 + }, + { + "completion_length": 149.6666717529297, + "epoch": 1.2202797202797202, + "grad_norm": 1.465401530265808, + "kl": 0.71610426902771, + "learning_rate": 4.915940198303324e-06, + "loss": 0.0286, + "reward": 2.183333396911621, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.5166666507720947, + "step": 349 + }, + { + "completion_length": 265.66668701171875, + "epoch": 1.2237762237762237, + "grad_norm": 1.1324924230575562, + "kl": 0.39196571707725525, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0157, + "reward": 2.016666889190674, + "reward_std": 0.9521905779838562, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 350 + }, + { + "completion_length": 228.1666717529297, + "epoch": 1.2272727272727273, + "grad_norm": 2.361294746398926, + "kl": 0.5443918704986572, + "learning_rate": 4.913681577198063e-06, + "loss": 0.0218, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 351 + }, + { + "completion_length": 645.1666870117188, + "epoch": 1.2307692307692308, + "grad_norm": 1.6541866064071655, + "kl": 0.3587082326412201, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0143, + "reward": 3.0208334922790527, + "reward_std": 1.1969144344329834, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6875, + "step": 352 + }, + { + "completion_length": 592.1666870117188, + "epoch": 1.2342657342657342, + "grad_norm": 3.038172483444214, + "kl": 0.6741119623184204, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.027, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 353 + }, + { + "completion_length": 416.16668701171875, + "epoch": 1.2377622377622377, + "grad_norm": 1.0763347148895264, + "kl": 0.32444697618484497, + "learning_rate": 4.910238510585275e-06, + "loss": 0.013, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 354 + }, + { + "completion_length": 276.3333435058594, + "epoch": 1.2412587412587412, + "grad_norm": 2.7986843585968018, + "kl": 0.9174998998641968, + "learning_rate": 4.909076133021558e-06, + "loss": 0.0367, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 355 + }, + { + "completion_length": 269.16668701171875, + "epoch": 1.2447552447552448, + "grad_norm": 0.9633187055587769, + "kl": 0.3955456614494324, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0158, + "reward": 3.066667079925537, + "reward_std": 0.4490731656551361, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 356 + }, + { + "completion_length": 313.16668701171875, + "epoch": 1.2482517482517483, + "grad_norm": 2.313849449157715, + "kl": 0.662523627281189, + "learning_rate": 4.906729366066197e-06, + "loss": 0.0265, + "reward": 1.7666667699813843, + "reward_std": 1.1767185926437378, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 357 + }, + { + "completion_length": 216.0, + "epoch": 1.2517482517482517, + "grad_norm": 4.379472255706787, + "kl": 0.7677586078643799, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0307, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6333333253860474, + "step": 358 + }, + { + "completion_length": 860.3333740234375, + "epoch": 1.2552447552447552, + "grad_norm": 2.9275009632110596, + "kl": 0.6438803672790527, + "learning_rate": 4.904353273873029e-06, + "loss": 0.0258, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 359 + }, + { + "completion_length": 217.83334350585938, + "epoch": 1.2587412587412588, + "grad_norm": 2.738201141357422, + "kl": 0.6947124004364014, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0278, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 360 + }, + { + "completion_length": 850.6666870117188, + "epoch": 1.2622377622377623, + "grad_norm": 0.6407853364944458, + "kl": 0.21777069568634033, + "learning_rate": 4.901947885393986e-06, + "loss": 0.0087, + "reward": 3.066667079925537, + "reward_std": 0.9389710426330566, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 361 + }, + { + "completion_length": 430.5, + "epoch": 1.2657342657342658, + "grad_norm": 3.934774398803711, + "kl": 1.3171093463897705, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0527, + "reward": 2.4666666984558105, + "reward_std": 1.7380064725875854, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 362 + }, + { + "completion_length": 1049.0, + "epoch": 1.2692307692307692, + "grad_norm": 1.0587317943572998, + "kl": 0.3339938521385193, + "learning_rate": 4.899513229937968e-06, + "loss": 0.0134, + "reward": 1.183333396911621, + "reward_std": 0.6088240146636963, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8499999046325684, + "step": 363 + }, + { + "completion_length": 752.5, + "epoch": 1.2727272727272727, + "grad_norm": 0.9463182687759399, + "kl": 0.2867739796638489, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0115, + "reward": 1.445833444595337, + "reward_std": 1.1011831760406494, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 364 + }, + { + "completion_length": 302.3333435058594, + "epoch": 1.2762237762237763, + "grad_norm": 1.0470837354660034, + "kl": 0.4384109377861023, + "learning_rate": 4.897049337170483e-06, + "loss": 0.0175, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 365 + }, + { + "completion_length": 299.5, + "epoch": 1.2797202797202798, + "grad_norm": 1.4532350301742554, + "kl": 0.48457586765289307, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0194, + "reward": 2.183333396911621, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8500000238418579, + "step": 366 + }, + { + "completion_length": 591.1666870117188, + "epoch": 1.2832167832167833, + "grad_norm": 1.7987697124481201, + "kl": 0.44638824462890625, + "learning_rate": 4.894556237113287e-06, + "loss": 0.0179, + "reward": 1.1166666746139526, + "reward_std": 1.1223487854003906, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 367 + }, + { + "completion_length": 1384.5, + "epoch": 1.2867132867132867, + "grad_norm": 0.4040040373802185, + "kl": 0.12767352163791656, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0051, + "reward": 1.691666841506958, + "reward_std": 1.4019334316253662, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 368 + }, + { + "completion_length": 440.3333435058594, + "epoch": 1.2902097902097902, + "grad_norm": 1.9347208738327026, + "kl": 0.46111249923706055, + "learning_rate": 4.89203396014402e-06, + "loss": 0.0184, + "reward": 1.9333332777023315, + "reward_std": 1.0510313510894775, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7666666507720947, + "step": 369 + }, + { + "completion_length": 602.8333740234375, + "epoch": 1.2937062937062938, + "grad_norm": 1.7568728923797607, + "kl": 0.5643346309661865, + "learning_rate": 4.890761889907589e-06, + "loss": 0.0226, + "reward": 1.2333333492279053, + "reward_std": 1.1513760089874268, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.40000003576278687, + "step": 370 + }, + { + "completion_length": 584.1666870117188, + "epoch": 1.297202797202797, + "grad_norm": 2.6727964878082275, + "kl": 0.5424228310585022, + "learning_rate": 4.889482536995826e-06, + "loss": 0.0217, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 302.16668701171875, + "epoch": 1.3006993006993006, + "grad_norm": 1.0215359926223755, + "kl": 0.38776999711990356, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0155, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 372 + }, + { + "completion_length": 1038.5, + "epoch": 1.3041958041958042, + "grad_norm": 0.8328973054885864, + "kl": 0.31271958351135254, + "learning_rate": 4.886901998756995e-06, + "loss": 0.0125, + "reward": 1.4750001430511475, + "reward_std": 1.0486897230148315, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6416667103767395, + "step": 373 + }, + { + "completion_length": 407.16668701171875, + "epoch": 1.3076923076923077, + "grad_norm": 1.812672734260559, + "kl": 0.3156376779079437, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0126, + "reward": 3.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 374 + }, + { + "completion_length": 264.16668701171875, + "epoch": 1.3111888111888113, + "grad_norm": 4.727421760559082, + "kl": 1.329188585281372, + "learning_rate": 4.884292376870567e-06, + "loss": 0.0532, + "reward": 2.0916666984558105, + "reward_std": 0.94890296459198, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 375 + }, + { + "completion_length": 516.5, + "epoch": 1.3146853146853146, + "grad_norm": 2.27711820602417, + "kl": 0.6330995559692383, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0253, + "reward": 1.3583333492279053, + "reward_std": 1.1029127836227417, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6916667222976685, + "step": 376 + }, + { + "completion_length": 420.66668701171875, + "epoch": 1.3181818181818181, + "grad_norm": 2.9678735733032227, + "kl": 0.8875288367271423, + "learning_rate": 4.881653703133966e-06, + "loss": 0.0355, + "reward": 1.883333444595337, + "reward_std": 1.3325413465499878, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 377 + }, + { + "completion_length": 753.1666870117188, + "epoch": 1.3216783216783217, + "grad_norm": 0.774476945400238, + "kl": 0.36767667531967163, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0147, + "reward": 2.3583335876464844, + "reward_std": 1.55962073802948, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 378 + }, + { + "completion_length": 182.5, + "epoch": 1.3251748251748252, + "grad_norm": 1.207739233970642, + "kl": 0.43915602564811707, + "learning_rate": 4.878986009698596e-06, + "loss": 0.0176, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 379 + }, + { + "completion_length": 341.0, + "epoch": 1.3286713286713288, + "grad_norm": 0.7512596249580383, + "kl": 0.3403867483139038, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0136, + "reward": 3.0416667461395264, + "reward_std": 1.4800056219100952, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.875, + "step": 380 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.332167832167832, + "grad_norm": 2.4150354862213135, + "kl": 0.6687287092208862, + "learning_rate": 4.87628932906946e-06, + "loss": 0.0267, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 381 + }, + { + "completion_length": 657.5, + "epoch": 1.3356643356643356, + "grad_norm": 1.1033812761306763, + "kl": 0.2525772750377655, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 382 + }, + { + "completion_length": 655.6666870117188, + "epoch": 1.3391608391608392, + "grad_norm": 2.7283008098602295, + "kl": 0.7087686061859131, + "learning_rate": 4.87356369410476e-06, + "loss": 0.0284, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 383 + }, + { + "completion_length": 1037.166748046875, + "epoch": 1.3426573426573427, + "grad_norm": 1.4860605001449585, + "kl": 0.35516053438186646, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0142, + "reward": 1.3416666984558105, + "reward_std": 1.0956352949142456, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6750000715255737, + "step": 384 + }, + { + "completion_length": 776.0, + "epoch": 1.3461538461538463, + "grad_norm": 2.1169064044952393, + "kl": 0.6649973392486572, + "learning_rate": 4.870809138015499e-06, + "loss": 0.0266, + "reward": 1.4750001430511475, + "reward_std": 1.2451908588409424, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 385 + }, + { + "completion_length": 803.8333740234375, + "epoch": 1.3496503496503496, + "grad_norm": 1.5138658285140991, + "kl": 0.5593903064727783, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0224, + "reward": 1.2250001430511475, + "reward_std": 1.229125738143921, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333373069763, + "step": 386 + }, + { + "completion_length": 579.8333740234375, + "epoch": 1.3531468531468531, + "grad_norm": 0.8988491892814636, + "kl": 0.2851899266242981, + "learning_rate": 4.868025694365073e-06, + "loss": 0.0114, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 387 + }, + { + "completion_length": 173.5, + "epoch": 1.3566433566433567, + "grad_norm": 1.3644022941589355, + "kl": 0.5744073390960693, + "learning_rate": 4.866623150289241e-06, + "loss": 0.023, + "reward": 1.9666666984558105, + "reward_std": 1.2412359714508057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 388 + }, + { + "completion_length": 578.3333740234375, + "epoch": 1.3601398601398602, + "grad_norm": 0.8156600594520569, + "kl": 0.2687755227088928, + "learning_rate": 4.865213397068864e-06, + "loss": 0.0108, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 389 + }, + { + "completion_length": 1756.8333740234375, + "epoch": 1.3636363636363638, + "grad_norm": 0.36968812346458435, + "kl": 0.11372655630111694, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0045, + "reward": 1.4666666984558105, + "reward_std": 0.9174239635467529, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.6333333253860474, + "step": 390 + }, + { + "completion_length": 605.5, + "epoch": 1.367132867132867, + "grad_norm": 1.086455225944519, + "kl": 0.2938157916069031, + "learning_rate": 4.862372280393828e-06, + "loss": 0.0118, + "reward": 2.4375, + "reward_std": 1.2702115774154663, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7708333730697632, + "step": 391 + }, + { + "completion_length": 736.0, + "epoch": 1.3706293706293706, + "grad_norm": 3.411510705947876, + "kl": 0.9218753576278687, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0369, + "reward": 1.4583333730697632, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7916666865348816, + "step": 392 + }, + { + "completion_length": 166.5, + "epoch": 1.3741258741258742, + "grad_norm": 1.464406132698059, + "kl": 0.34225571155548096, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.0137, + "reward": 1.6041667461395264, + "reward_std": 0.7573666572570801, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7708332538604736, + "step": 393 + }, + { + "completion_length": 646.5, + "epoch": 1.3776223776223775, + "grad_norm": 1.6122732162475586, + "kl": 0.4424184560775757, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0177, + "reward": 1.3250000476837158, + "reward_std": 0.9527591466903687, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.8250000476837158, + "step": 394 + }, + { + "completion_length": 641.1666870117188, + "epoch": 1.381118881118881, + "grad_norm": 0.6985570192337036, + "kl": 0.23967330157756805, + "learning_rate": 4.856603727730446e-06, + "loss": 0.0096, + "reward": 2.5458333492279053, + "reward_std": 1.5425965785980225, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7125000357627869, + "step": 395 + }, + { + "completion_length": 161.83334350585938, + "epoch": 1.3846153846153846, + "grad_norm": 1.9270485639572144, + "kl": 0.7514389753341675, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0301, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 396 + }, + { + "completion_length": 166.0, + "epoch": 1.3881118881118881, + "grad_norm": 1.2144757509231567, + "kl": 0.35039469599723816, + "learning_rate": 4.853676362030095e-06, + "loss": 0.014, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 397 + }, + { + "completion_length": 569.0, + "epoch": 1.3916083916083917, + "grad_norm": 6.755039215087891, + "kl": 0.7890805006027222, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0316, + "reward": 2.1083333492279053, + "reward_std": 1.7987264394760132, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 398 + }, + { + "completion_length": 909.0, + "epoch": 1.395104895104895, + "grad_norm": 0.7347401976585388, + "kl": 0.18117789924144745, + "learning_rate": 4.850720317526047e-06, + "loss": 0.0072, + "reward": 1.962499976158142, + "reward_std": 0.534263551235199, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7958333492279053, + "step": 399 + }, + { + "completion_length": 793.5, + "epoch": 1.3986013986013985, + "grad_norm": 0.849243700504303, + "kl": 0.27008673548698425, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0108, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 400 + }, + { + "completion_length": 554.1666870117188, + "epoch": 1.402097902097902, + "grad_norm": 2.7050747871398926, + "kl": 0.5240260362625122, + "learning_rate": 4.847735630236773e-06, + "loss": 0.021, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 401 + }, + { + "completion_length": 215.83334350585938, + "epoch": 1.4055944055944056, + "grad_norm": 0.9243234992027283, + "kl": 0.3121068477630615, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0125, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 402 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.4090909090909092, + "grad_norm": 3.3891875743865967, + "kl": 0.5218031406402588, + "learning_rate": 4.844722336529745e-06, + "loss": 0.0209, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7916666865348816, + "step": 403 + }, + { + "completion_length": 923.5, + "epoch": 1.4125874125874125, + "grad_norm": 3.197908878326416, + "kl": 0.7076524496078491, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0283, + "reward": 2.0458335876464844, + "reward_std": 1.3396285772323608, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.7125000357627869, + "step": 404 + }, + { + "completion_length": 197.83334350585938, + "epoch": 1.416083916083916, + "grad_norm": 1.1261261701583862, + "kl": 0.3264281153678894, + "learning_rate": 4.841680473120994e-06, + "loss": 0.0131, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 405 + }, + { + "completion_length": 554.5, + "epoch": 1.4195804195804196, + "grad_norm": 3.3561604022979736, + "kl": 0.8642048835754395, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0346, + "reward": 1.8500001430511475, + "reward_std": 1.0315039157867432, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 406 + }, + { + "completion_length": 795.8333740234375, + "epoch": 1.4230769230769231, + "grad_norm": 4.25921630859375, + "kl": 0.770601749420166, + "learning_rate": 4.838610077074669e-06, + "loss": 0.0308, + "reward": 1.2916667461395264, + "reward_std": 1.0551856756210327, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.4583333432674408, + "step": 407 + }, + { + "completion_length": 915.0, + "epoch": 1.4265734265734267, + "grad_norm": 0.571506142616272, + "kl": 0.20412606000900269, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0082, + "reward": 2.241666793823242, + "reward_std": 1.3698238134384155, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7416666746139526, + "step": 408 + }, + { + "completion_length": 520.6666870117188, + "epoch": 1.43006993006993, + "grad_norm": 0.9773194193840027, + "kl": 0.29276588559150696, + "learning_rate": 4.835511185802574e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 409 + }, + { + "completion_length": 357.5, + "epoch": 1.4335664335664335, + "grad_norm": 2.5951545238494873, + "kl": 0.4989779591560364, + "learning_rate": 4.833951066243004e-06, + "loss": 0.02, + "reward": 1.945833444595337, + "reward_std": 1.279689073562622, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.612500011920929, + "step": 410 + }, + { + "completion_length": 794.3333740234375, + "epoch": 1.437062937062937, + "grad_norm": 0.761000394821167, + "kl": 0.20721551775932312, + "learning_rate": 4.832383837063723e-06, + "loss": 0.0083, + "reward": 2.0416667461395264, + "reward_std": 1.100189447402954, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 411 + }, + { + "completion_length": 1086.5, + "epoch": 1.4405594405594406, + "grad_norm": 0.9872347116470337, + "kl": 0.296750009059906, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0119, + "reward": 2.0916666984558105, + "reward_std": 1.442365050315857, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5916666984558105, + "step": 412 + }, + { + "completion_length": 168.5, + "epoch": 1.4440559440559442, + "grad_norm": 1.2185351848602295, + "kl": 0.34197482466697693, + "learning_rate": 4.829228068963873e-06, + "loss": 0.0137, + "reward": 3.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 413 + }, + { + "completion_length": 775.3333740234375, + "epoch": 1.4475524475524475, + "grad_norm": 1.1913334131240845, + "kl": 0.3759481906890869, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.015, + "reward": 0.8916667699813843, + "reward_std": 0.5633975267410278, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.1666666716337204, + "rewards/reward_retry": 0.7250000834465027, + "step": 414 + }, + { + "completion_length": 203.6666717529297, + "epoch": 1.451048951048951, + "grad_norm": 1.0359302759170532, + "kl": 0.31211602687835693, + "learning_rate": 4.826043919955062e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 415 + }, + { + "completion_length": 543.6666870117188, + "epoch": 1.4545454545454546, + "grad_norm": 0.7396105527877808, + "kl": 0.25116777420043945, + "learning_rate": 4.824441214720629e-06, + "loss": 0.01, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 416 + }, + { + "completion_length": 253.0, + "epoch": 1.458041958041958, + "grad_norm": 2.3947131633758545, + "kl": 0.3577002286911011, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.0143, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 417 + }, + { + "completion_length": 776.0, + "epoch": 1.4615384615384617, + "grad_norm": 0.9339893460273743, + "kl": 0.2636467218399048, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0105, + "reward": 2.2333333492279053, + "reward_std": 0.5671566724777222, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 418 + }, + { + "completion_length": 185.33334350585938, + "epoch": 1.465034965034965, + "grad_norm": 3.6216635704040527, + "kl": 0.6233493685722351, + "learning_rate": 4.8195906347473e-06, + "loss": 0.0249, + "reward": 1.8000000715255737, + "reward_std": 1.579240322113037, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 419 + }, + { + "completion_length": 1112.0, + "epoch": 1.4685314685314685, + "grad_norm": 0.6356344223022461, + "kl": 0.26539915800094604, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0106, + "reward": 2.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 420 + }, + { + "completion_length": 531.1666870117188, + "epoch": 1.472027972027972, + "grad_norm": 0.8300501108169556, + "kl": 0.31844228506088257, + "learning_rate": 4.816321577179594e-06, + "loss": 0.0127, + "reward": 2.875, + "reward_std": 0.7834221124649048, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 421 + }, + { + "completion_length": 218.83334350585938, + "epoch": 1.4755244755244754, + "grad_norm": 0.796237051486969, + "kl": 0.331187903881073, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 422 + }, + { + "completion_length": 186.83334350585938, + "epoch": 1.479020979020979, + "grad_norm": 1.279965877532959, + "kl": 0.3236890733242035, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.0129, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 423 + }, + { + "completion_length": 249.0, + "epoch": 1.4825174825174825, + "grad_norm": 4.079779624938965, + "kl": 0.39256423711776733, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0157, + "reward": 1.7125000953674316, + "reward_std": 0.9684717655181885, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 424 + }, + { + "completion_length": 183.33334350585938, + "epoch": 1.486013986013986, + "grad_norm": 1.1069165468215942, + "kl": 0.262847363948822, + "learning_rate": 4.809698831278217e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 425 + }, + { + "completion_length": 199.6666717529297, + "epoch": 1.4895104895104896, + "grad_norm": 1.413517713546753, + "kl": 0.39733991026878357, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0159, + "reward": 2.7083334922790527, + "reward_std": 0.8662660121917725, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 426 + }, + { + "completion_length": 235.6666717529297, + "epoch": 1.493006993006993, + "grad_norm": 0.9659198522567749, + "kl": 0.2365071177482605, + "learning_rate": 4.806345223640616e-06, + "loss": 0.0095, + "reward": 1.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 427 + }, + { + "completion_length": 774.1666870117188, + "epoch": 1.4965034965034965, + "grad_norm": 0.830765962600708, + "kl": 0.33350443840026855, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0133, + "reward": 2.183333396911621, + "reward_std": 1.3265244960784912, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 428 + }, + { + "completion_length": 203.0, + "epoch": 1.5, + "grad_norm": 1.0319793224334717, + "kl": 0.27221041917800903, + "learning_rate": 4.802963513914304e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 429 + }, + { + "completion_length": 461.16668701171875, + "epoch": 1.5034965034965035, + "grad_norm": 1.0231879949569702, + "kl": 0.24733422696590424, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0099, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 430 + }, + { + "completion_length": 244.83334350585938, + "epoch": 1.506993006993007, + "grad_norm": 0.9520881772041321, + "kl": 0.31419527530670166, + "learning_rate": 4.799553743304345e-06, + "loss": 0.0126, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 431 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.5104895104895104, + "grad_norm": 0.8148533701896667, + "kl": 0.2550124228000641, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0102, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 432 + }, + { + "completion_length": 1087.8333740234375, + "epoch": 1.513986013986014, + "grad_norm": 0.3516090214252472, + "kl": 0.2816867530345917, + "learning_rate": 4.796115953357718e-06, + "loss": 0.0113, + "reward": 2.2833333015441895, + "reward_std": 1.2408331632614136, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.3333333432674408, + "rewards/reward_retry": 0.6166666746139526, + "step": 433 + }, + { + "completion_length": 556.3333740234375, + "epoch": 1.5174825174825175, + "grad_norm": 3.6779227256774902, + "kl": 0.4250108003616333, + "learning_rate": 4.794386564209953e-06, + "loss": 0.017, + "reward": 2.4083335399627686, + "reward_std": 1.687132716178894, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 434 + }, + { + "completion_length": 707.8333740234375, + "epoch": 1.5209790209790208, + "grad_norm": 1.121485948562622, + "kl": 0.24696388840675354, + "learning_rate": 4.79265018596281e-06, + "loss": 0.0099, + "reward": 2.9000000953674316, + "reward_std": 0.9027735590934753, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 435 + }, + { + "completion_length": 469.8333435058594, + "epoch": 1.5244755244755246, + "grad_norm": 2.6518046855926514, + "kl": 0.7716752886772156, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0309, + "reward": 1.8000000715255737, + "reward_std": 1.447066068649292, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 436 + }, + { + "completion_length": 192.83334350585938, + "epoch": 1.527972027972028, + "grad_norm": 1.165176272392273, + "kl": 0.2884241044521332, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.0115, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 437 + }, + { + "completion_length": 254.6666717529297, + "epoch": 1.5314685314685315, + "grad_norm": 0.8783808350563049, + "kl": 0.26613113284111023, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0106, + "reward": 3.370833396911621, + "reward_std": 1.011238932609558, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 438 + }, + { + "completion_length": 158.5, + "epoch": 1.534965034965035, + "grad_norm": 2.008617877960205, + "kl": 0.5028926134109497, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.0201, + "reward": 1.7416666746139526, + "reward_std": 1.1517016887664795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7416666746139526, + "step": 439 + }, + { + "completion_length": 208.5, + "epoch": 1.5384615384615383, + "grad_norm": 0.8693957924842834, + "kl": 0.2799164056777954, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0112, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 440 + }, + { + "completion_length": 211.5, + "epoch": 1.541958041958042, + "grad_norm": 1.5437381267547607, + "kl": 0.3011782467365265, + "learning_rate": 4.782085443082607e-06, + "loss": 0.012, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 441 + }, + { + "completion_length": 491.8333435058594, + "epoch": 1.5454545454545454, + "grad_norm": 3.308060884475708, + "kl": 0.43526870012283325, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0174, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 442 + }, + { + "completion_length": 177.1666717529297, + "epoch": 1.548951048951049, + "grad_norm": 2.3108198642730713, + "kl": 0.6005208492279053, + "learning_rate": 4.778508191588613e-06, + "loss": 0.024, + "reward": 2.683333396911621, + "reward_std": 1.2110600471496582, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 443 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.5524475524475525, + "grad_norm": 0.9576809406280518, + "kl": 0.3041282296180725, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 444 + }, + { + "completion_length": 807.3333740234375, + "epoch": 1.5559440559440558, + "grad_norm": 0.6298768520355225, + "kl": 0.2337806224822998, + "learning_rate": 4.774903177191358e-06, + "loss": 0.0094, + "reward": 2.5458335876464844, + "reward_std": 1.3377609252929688, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 445 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.5594405594405596, + "grad_norm": 1.1019190549850464, + "kl": 0.39509618282318115, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0158, + "reward": 2.049999952316284, + "reward_std": 1.5391557216644287, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 446 + }, + { + "completion_length": 787.6666870117188, + "epoch": 1.562937062937063, + "grad_norm": 0.893694281578064, + "kl": 0.37470337748527527, + "learning_rate": 4.771270443816805e-06, + "loss": 0.015, + "reward": 2.2083334922790527, + "reward_std": 0.8720186948776245, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 447 + }, + { + "completion_length": 546.8333740234375, + "epoch": 1.5664335664335665, + "grad_norm": 0.837485134601593, + "kl": 0.22402605414390564, + "learning_rate": 4.769443696332272e-06, + "loss": 0.009, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 448 + }, + { + "completion_length": 177.6666717529297, + "epoch": 1.56993006993007, + "grad_norm": 1.617317795753479, + "kl": 0.3958384692668915, + "learning_rate": 4.767610035728663e-06, + "loss": 0.0158, + "reward": 2.875, + "reward_std": 1.0068515539169312, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 449 + }, + { + "completion_length": 147.33334350585938, + "epoch": 1.5734265734265733, + "grad_norm": 0.9628480076789856, + "kl": 0.3490566611289978, + "learning_rate": 4.765769467591626e-06, + "loss": 0.014, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 450 + }, + { + "completion_length": 203.83334350585938, + "epoch": 1.5769230769230769, + "grad_norm": 0.9194980263710022, + "kl": 0.3181028962135315, + "learning_rate": 4.763921997527849e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 451 + }, + { + "completion_length": 167.5, + "epoch": 1.5804195804195804, + "grad_norm": 3.041954517364502, + "kl": 0.426164835691452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.017, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 452 + }, + { + "completion_length": 212.33334350585938, + "epoch": 1.583916083916084, + "grad_norm": 1.1762245893478394, + "kl": 0.2974995970726013, + "learning_rate": 4.760206374151947e-06, + "loss": 0.0119, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 453 + }, + { + "completion_length": 493.66668701171875, + "epoch": 1.5874125874125875, + "grad_norm": 1.3206851482391357, + "kl": 0.36789295077323914, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0147, + "reward": 1.9166667461395264, + "reward_std": 1.2738393545150757, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 454 + }, + { + "completion_length": 205.0, + "epoch": 1.5909090909090908, + "grad_norm": 1.0482568740844727, + "kl": 0.2594867944717407, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 455 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.5944055944055944, + "grad_norm": 2.1341159343719482, + "kl": 0.4591405391693115, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0184, + "reward": 3.7083334922790527, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 456 + }, + { + "completion_length": 633.3333740234375, + "epoch": 1.597902097902098, + "grad_norm": 1.0107204914093018, + "kl": 0.24642407894134521, + "learning_rate": 4.752692553305229e-06, + "loss": 0.0099, + "reward": 3.0375001430511475, + "reward_std": 0.7974569201469421, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708333373069763, + "step": 457 + }, + { + "completion_length": 517.0, + "epoch": 1.6013986013986012, + "grad_norm": 0.6217291355133057, + "kl": 0.22938358783721924, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0092, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 458 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.604895104895105, + "grad_norm": 0.5446264743804932, + "kl": 0.1968853920698166, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.0079, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 459 + }, + { + "completion_length": 193.83334350585938, + "epoch": 1.6083916083916083, + "grad_norm": 0.8946224451065063, + "kl": 0.25773894786834717, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0103, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 460 + }, + { + "completion_length": 204.6666717529297, + "epoch": 1.6118881118881119, + "grad_norm": 0.8260864019393921, + "kl": 0.2527741491794586, + "learning_rate": 4.745068939401539e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 461 + }, + { + "completion_length": 848.6666870117188, + "epoch": 1.6153846153846154, + "grad_norm": 1.5746495723724365, + "kl": 0.3351367712020874, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0134, + "reward": 2.25, + "reward_std": 0.7803846597671509, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.75, + "step": 462 + }, + { + "completion_length": 190.0, + "epoch": 1.6188811188811187, + "grad_norm": 1.0435597896575928, + "kl": 0.26553571224212646, + "learning_rate": 4.741216075958602e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 463 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.6223776223776225, + "grad_norm": 1.0996354818344116, + "kl": 0.31133967638015747, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0125, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 464 + }, + { + "completion_length": 512.6666870117188, + "epoch": 1.6258741258741258, + "grad_norm": 0.7010518908500671, + "kl": 0.21432137489318848, + "learning_rate": 4.737335904005063e-06, + "loss": 0.0086, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 465 + }, + { + "completion_length": 527.0, + "epoch": 1.6293706293706294, + "grad_norm": 0.5995029211044312, + "kl": 0.22433510422706604, + "learning_rate": 4.735385592098421e-06, + "loss": 0.009, + "reward": 2.7916667461395264, + "reward_std": 1.4527275562286377, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 191.0, + "epoch": 1.632867132867133, + "grad_norm": 1.2079272270202637, + "kl": 0.2614157795906067, + "learning_rate": 4.733428470819595e-06, + "loss": 0.0105, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 467 + }, + { + "completion_length": 783.1666870117188, + "epoch": 1.6363636363636362, + "grad_norm": 2.2251851558685303, + "kl": 0.6713162660598755, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0269, + "reward": 2.4375, + "reward_std": 1.3401259183883667, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7708333730697632, + "step": 468 + }, + { + "completion_length": 529.1666870117188, + "epoch": 1.63986013986014, + "grad_norm": 0.5742272138595581, + "kl": 0.23623262345790863, + "learning_rate": 4.729493824013036e-06, + "loss": 0.0094, + "reward": 2.2125000953674316, + "reward_std": 1.234073519706726, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 469 + }, + { + "completion_length": 181.0, + "epoch": 1.6433566433566433, + "grad_norm": 1.7596086263656616, + "kl": 0.33919036388397217, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0136, + "reward": 1.8500001430511475, + "reward_std": 1.2247450351715088, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.8499999046325684, + "step": 470 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.6468531468531469, + "grad_norm": 1.0671755075454712, + "kl": 0.27314767241477966, + "learning_rate": 4.725532011527817e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 471 + }, + { + "completion_length": 189.6666717529297, + "epoch": 1.6503496503496504, + "grad_norm": 1.0676515102386475, + "kl": 0.2805836498737335, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0112, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 472 + }, + { + "completion_length": 836.5, + "epoch": 1.6538461538461537, + "grad_norm": 0.8203516006469727, + "kl": 0.172221839427948, + "learning_rate": 4.721543081637372e-06, + "loss": 0.0069, + "reward": 1.5833333730697632, + "reward_std": 1.0308573246002197, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7499999403953552, + "step": 473 + }, + { + "completion_length": 169.0, + "epoch": 1.6573426573426573, + "grad_norm": 1.7924721240997314, + "kl": 0.30363911390304565, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0121, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 474 + }, + { + "completion_length": 176.6666717529297, + "epoch": 1.6608391608391608, + "grad_norm": 0.19596193730831146, + "kl": 0.24111799895763397, + "learning_rate": 4.717527082945555e-06, + "loss": 0.0108, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 475 + }, + { + "completion_length": 234.6666717529297, + "epoch": 1.6643356643356644, + "grad_norm": 0.9966434240341187, + "kl": 0.25714850425720215, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0103, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 476 + }, + { + "completion_length": 1046.8333740234375, + "epoch": 1.667832167832168, + "grad_norm": 0.6285001635551453, + "kl": 0.1687658280134201, + "learning_rate": 4.71348406438604e-06, + "loss": 0.0068, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6916666030883789, + "step": 477 + }, + { + "completion_length": 219.1666717529297, + "epoch": 1.6713286713286712, + "grad_norm": 1.0476932525634766, + "kl": 0.29544544219970703, + "learning_rate": 4.71145243803771e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 478 + }, + { + "completion_length": 561.1666870117188, + "epoch": 1.6748251748251748, + "grad_norm": 1.0641223192214966, + "kl": 0.1950298398733139, + "learning_rate": 4.709414075221734e-06, + "loss": 0.0078, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 479 + }, + { + "completion_length": 228.5, + "epoch": 1.6783216783216783, + "grad_norm": 0.8561164736747742, + "kl": 0.26422810554504395, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 480 + }, + { + "completion_length": 509.3333435058594, + "epoch": 1.6818181818181817, + "grad_norm": 0.5843437314033508, + "kl": 0.20474323630332947, + "learning_rate": 4.70531716504417e-06, + "loss": 0.0082, + "reward": 2.183333396911621, + "reward_std": 1.2027745246887207, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 481 + }, + { + "completion_length": 548.6666870117188, + "epoch": 1.6853146853146854, + "grad_norm": 0.648353636264801, + "kl": 0.18905925750732422, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0076, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7124999761581421, + "step": 482 + }, + { + "completion_length": 219.6666717529297, + "epoch": 1.6888111888111887, + "grad_norm": 4.2207932472229, + "kl": 1.0905920267105103, + "learning_rate": 4.701193383772905e-06, + "loss": 0.0436, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 483 + }, + { + "completion_length": 1049.166748046875, + "epoch": 1.6923076923076923, + "grad_norm": 0.5171648859977722, + "kl": 0.20516209304332733, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0082, + "reward": 2.2333333492279053, + "reward_std": 0.9174240827560425, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 484 + }, + { + "completion_length": 201.6666717529297, + "epoch": 1.6958041958041958, + "grad_norm": 1.1004559993743896, + "kl": 0.2839426100254059, + "learning_rate": 4.697042781654913e-06, + "loss": 0.0114, + "reward": 1.870833396911621, + "reward_std": 0.193917915225029, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 485 + }, + { + "completion_length": 190.33334350585938, + "epoch": 1.6993006993006992, + "grad_norm": 1.0573567152023315, + "kl": 0.22315821051597595, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0089, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 486 + }, + { + "completion_length": 835.5, + "epoch": 1.702797202797203, + "grad_norm": 0.7173390984535217, + "kl": 0.1510881930589676, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.006, + "reward": 1.5500000715255737, + "reward_std": 1.0904128551483154, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 487 + }, + { + "completion_length": 615.8333740234375, + "epoch": 1.7062937062937062, + "grad_norm": 0.8014463186264038, + "kl": 0.22651296854019165, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0091, + "reward": 2.7083334922790527, + "reward_std": 1.315453052520752, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 488 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7097902097902098, + "grad_norm": 3.6473190784454346, + "kl": 0.40026336908340454, + "learning_rate": 4.688661317500045e-06, + "loss": 0.016, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 489 + }, + { + "completion_length": 1151.5, + "epoch": 1.7132867132867133, + "grad_norm": 0.8561959266662598, + "kl": 0.16577297449111938, + "learning_rate": 4.68654926784849e-06, + "loss": 0.0066, + "reward": 2.7083334922790527, + "reward_std": 1.0641508102416992, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.875, + "step": 490 + }, + { + "completion_length": 397.3333435058594, + "epoch": 1.7167832167832167, + "grad_norm": 1.0723934173583984, + "kl": 0.21682481467723846, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.0087, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 491 + }, + { + "completion_length": 169.6666717529297, + "epoch": 1.7202797202797204, + "grad_norm": 1.4164685010910034, + "kl": 0.245243638753891, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0098, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 492 + }, + { + "completion_length": 110.33333587646484, + "epoch": 1.7237762237762237, + "grad_norm": 5.974154949188232, + "kl": 1.1889418363571167, + "learning_rate": 4.680173181080302e-06, + "loss": 0.0476, + "reward": 3.075000286102295, + "reward_std": 1.1660832166671753, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 493 + }, + { + "completion_length": 215.5, + "epoch": 1.7272727272727273, + "grad_norm": 0.9199399352073669, + "kl": 0.2431143820285797, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0097, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 494 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.7307692307692308, + "grad_norm": 0.9801461696624756, + "kl": 0.25382137298583984, + "learning_rate": 4.675889239849749e-06, + "loss": 0.0102, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 495 + }, + { + "completion_length": 846.6666870117188, + "epoch": 1.7342657342657342, + "grad_norm": 0.6822401881217957, + "kl": 0.21501430869102478, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0086, + "reward": 2.679166793823242, + "reward_std": 1.3748105764389038, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8458333015441895, + "step": 496 + }, + { + "completion_length": 182.33334350585938, + "epoch": 1.737762237762238, + "grad_norm": 6.3415422439575195, + "kl": 1.284159541130066, + "learning_rate": 4.671578786095479e-06, + "loss": 0.0514, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 497 + }, + { + "completion_length": 164.83334350585938, + "epoch": 1.7412587412587412, + "grad_norm": 1.421428918838501, + "kl": 0.3243716359138489, + "learning_rate": 4.669413633422322e-06, + "loss": 0.013, + "reward": 3.566666603088379, + "reward_std": 0.6013872623443604, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 498 + }, + { + "completion_length": 229.6666717529297, + "epoch": 1.7447552447552448, + "grad_norm": 0.8355535864830017, + "kl": 0.24279817938804626, + "learning_rate": 4.667241872339007e-06, + "loss": 0.0097, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 499 + }, + { + "completion_length": 672.6666870117188, + "epoch": 1.7482517482517483, + "grad_norm": 0.5215955376625061, + "kl": 0.19877499341964722, + "learning_rate": 4.665063509461098e-06, + "loss": 0.008, + "reward": 2.924999952316284, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 500 + }, + { + "completion_length": 198.83334350585938, + "epoch": 1.7517482517482517, + "grad_norm": 0.9148537516593933, + "kl": 0.24169328808784485, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.0097, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 501 + }, + { + "completion_length": 928.5, + "epoch": 1.7552447552447552, + "grad_norm": 0.4413454532623291, + "kl": 0.15593400597572327, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0062, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 502 + }, + { + "completion_length": 508.0, + "epoch": 1.7587412587412588, + "grad_norm": 0.7536454796791077, + "kl": 0.24186736345291138, + "learning_rate": 4.658488876516929e-06, + "loss": 0.0097, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 503 + }, + { + "completion_length": 208.33334350585938, + "epoch": 1.762237762237762, + "grad_norm": 1.1730728149414062, + "kl": 0.2987002432346344, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0119, + "reward": 2.758333206176758, + "reward_std": 1.0394309759140015, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 504 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.7657342657342658, + "grad_norm": 2.2083706855773926, + "kl": 0.3215945363044739, + "learning_rate": 4.654072901103815e-06, + "loss": 0.0129, + "reward": 2.0416667461395264, + "reward_std": 0.9002315402030945, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 505 + }, + { + "completion_length": 572.0, + "epoch": 1.7692307692307692, + "grad_norm": 0.8655341863632202, + "kl": 0.24153539538383484, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0097, + "reward": 1.870833396911621, + "reward_std": 1.0137083530426025, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 506 + }, + { + "completion_length": 517.5, + "epoch": 1.7727272727272727, + "grad_norm": 0.49979329109191895, + "kl": 0.16330799460411072, + "learning_rate": 4.649630678992184e-06, + "loss": 0.0065, + "reward": 2.4000000953674316, + "reward_std": 0.9460445642471313, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 507 + }, + { + "completion_length": 324.16668701171875, + "epoch": 1.7762237762237763, + "grad_norm": 0.9129101037979126, + "kl": 0.26079505681991577, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0104, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 508 + }, + { + "completion_length": 316.16668701171875, + "epoch": 1.7797202797202796, + "grad_norm": 0.7381297945976257, + "kl": 0.34089159965515137, + "learning_rate": 4.645162264309112e-06, + "loss": 0.0136, + "reward": 3.2333335876464844, + "reward_std": 0.849509596824646, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 509 + }, + { + "completion_length": 207.83334350585938, + "epoch": 1.7832167832167833, + "grad_norm": 1.0436253547668457, + "kl": 0.2835765480995178, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 510 + }, + { + "completion_length": 230.33334350585938, + "epoch": 1.7867132867132867, + "grad_norm": 0.9628374576568604, + "kl": 0.2641430199146271, + "learning_rate": 4.640667711500821e-06, + "loss": 0.0106, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 511 + }, + { + "completion_length": 507.66668701171875, + "epoch": 1.7902097902097902, + "grad_norm": 0.3851446211338043, + "kl": 0.251933217048645, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0101, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 512 + }, + { + "completion_length": 192.0, + "epoch": 1.7937062937062938, + "grad_norm": 1.3856638669967651, + "kl": 0.2984909415245056, + "learning_rate": 4.636147075332019e-06, + "loss": 0.0119, + "reward": 3.0916666984558105, + "reward_std": 1.2249150276184082, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 513 + }, + { + "completion_length": 206.83334350585938, + "epoch": 1.797202797202797, + "grad_norm": 0.9139816164970398, + "kl": 0.24960675835609436, + "learning_rate": 4.633876993188319e-06, + "loss": 0.01, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 514 + }, + { + "completion_length": 538.0, + "epoch": 1.8006993006993008, + "grad_norm": 0.7666388750076294, + "kl": 0.2067805826663971, + "learning_rate": 4.631600410885231e-06, + "loss": 0.0083, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 515 + }, + { + "completion_length": 186.0, + "epoch": 1.8041958041958042, + "grad_norm": 0.9322411417961121, + "kl": 0.24232684075832367, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0097, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 516 + }, + { + "completion_length": 170.6666717529297, + "epoch": 1.8076923076923077, + "grad_norm": 1.5746034383773804, + "kl": 0.36948150396347046, + "learning_rate": 4.627027773560129e-06, + "loss": 0.0148, + "reward": 2.516666889190674, + "reward_std": 1.525341510772705, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 517 + }, + { + "completion_length": 193.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.9759989380836487, + "kl": 0.3557225167751312, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0142, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 518 + }, + { + "completion_length": 523.6666870117188, + "epoch": 1.8146853146853146, + "grad_norm": 0.9804190993309021, + "kl": 0.2574712038040161, + "learning_rate": 4.622429219072854e-06, + "loss": 0.0103, + "reward": 1.633333444595337, + "reward_std": 1.1919171810150146, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 519 + }, + { + "completion_length": 1029.166748046875, + "epoch": 1.8181818181818183, + "grad_norm": 0.5941687822341919, + "kl": 0.1915300190448761, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0077, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 520 + }, + { + "completion_length": 157.1666717529297, + "epoch": 1.8216783216783217, + "grad_norm": 3.1836304664611816, + "kl": 0.6161837577819824, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.0246, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 521 + }, + { + "completion_length": 201.33334350585938, + "epoch": 1.8251748251748252, + "grad_norm": 1.5185062885284424, + "kl": 0.31097742915153503, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 522 + }, + { + "completion_length": 186.1666717529297, + "epoch": 1.8286713286713288, + "grad_norm": 0.936562180519104, + "kl": 0.3272198438644409, + "learning_rate": 4.613154583054641e-06, + "loss": 0.0131, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 523 + }, + { + "completion_length": 216.6666717529297, + "epoch": 1.832167832167832, + "grad_norm": 0.9323495626449585, + "kl": 0.3112618923187256, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0125, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 524 + }, + { + "completion_length": 525.3333740234375, + "epoch": 1.8356643356643356, + "grad_norm": 0.40873953700065613, + "kl": 0.241009920835495, + "learning_rate": 4.608478614532215e-06, + "loss": 0.0096, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 525 + }, + { + "completion_length": 160.83334350585938, + "epoch": 1.8391608391608392, + "grad_norm": 1.1447237730026245, + "kl": 0.37633103132247925, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0151, + "reward": 2.4583334922790527, + "reward_std": 1.0346095561981201, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 526 + }, + { + "completion_length": 176.5, + "epoch": 1.8426573426573427, + "grad_norm": 1.4215019941329956, + "kl": 0.31421756744384766, + "learning_rate": 4.603776954863266e-06, + "loss": 0.0126, + "reward": 2.2083334922790527, + "reward_std": 0.6003471612930298, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 527 + }, + { + "completion_length": 511.16668701171875, + "epoch": 1.8461538461538463, + "grad_norm": 0.7890862226486206, + "kl": 0.21260276436805725, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0085, + "reward": 2.2916667461395264, + "reward_std": 1.4901063442230225, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 528 + }, + { + "completion_length": 145.6666717529297, + "epoch": 1.8496503496503496, + "grad_norm": 2.972633123397827, + "kl": 1.6821321249008179, + "learning_rate": 4.599049661336033e-06, + "loss": 0.0673, + "reward": 2.4583334922790527, + "reward_std": 1.3603004217147827, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 529 + }, + { + "completion_length": 337.66668701171875, + "epoch": 1.8531468531468531, + "grad_norm": 0.4933686852455139, + "kl": 0.2972989082336426, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0119, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9250000715255737, + "step": 530 + }, + { + "completion_length": 1491.166748046875, + "epoch": 1.8566433566433567, + "grad_norm": 0.7114420533180237, + "kl": 0.16526620090007782, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.0066, + "reward": 2.683333396911621, + "reward_std": 0.8942409753799438, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 531 + }, + { + "completion_length": 822.0, + "epoch": 1.86013986013986, + "grad_norm": 0.4190931022167206, + "kl": 0.21502110362052917, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0086, + "reward": 2.7833335399627686, + "reward_std": 0.9831921458244324, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 532 + }, + { + "completion_length": 739.5, + "epoch": 1.8636363636363638, + "grad_norm": 0.5615747570991516, + "kl": 0.223265141248703, + "learning_rate": 4.589518403420676e-06, + "loss": 0.0089, + "reward": 2.3500001430511475, + "reward_std": 1.5231547355651855, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 533 + }, + { + "completion_length": 188.6666717529297, + "epoch": 1.867132867132867, + "grad_norm": 0.754673957824707, + "kl": 0.2731919288635254, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0109, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 534 + }, + { + "completion_length": 528.3333740234375, + "epoch": 1.8706293706293706, + "grad_norm": 0.45285508036613464, + "kl": 0.21540388464927673, + "learning_rate": 4.584714555167921e-06, + "loss": 0.0086, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 535 + }, + { + "completion_length": 513.1666870117188, + "epoch": 1.8741258741258742, + "grad_norm": 0.6436936259269714, + "kl": 0.2541727125644684, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 536 + }, + { + "completion_length": 503.3333435058594, + "epoch": 1.8776223776223775, + "grad_norm": 0.5080775618553162, + "kl": 0.2073960304260254, + "learning_rate": 4.579885305326206e-06, + "loss": 0.0083, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 537 + }, + { + "completion_length": 209.6666717529297, + "epoch": 1.8811188811188813, + "grad_norm": 0.9030362963676453, + "kl": 0.283308744430542, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0113, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 538 + }, + { + "completion_length": 121.5, + "epoch": 1.8846153846153846, + "grad_norm": 2.8895628452301025, + "kl": 0.8616495132446289, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.0345, + "reward": 1.4666666984558105, + "reward_std": 1.2002778053283691, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 539 + }, + { + "completion_length": 208.83334350585938, + "epoch": 1.8881118881118881, + "grad_norm": 1.0781502723693848, + "kl": 0.2666887640953064, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 540 + }, + { + "completion_length": 529.8333740234375, + "epoch": 1.8916083916083917, + "grad_norm": 0.8341970443725586, + "kl": 0.27578771114349365, + "learning_rate": 4.570150836556236e-06, + "loss": 0.011, + "reward": 2.683333396911621, + "reward_std": 0.9092121124267578, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 541 + }, + { + "completion_length": 509.0, + "epoch": 1.895104895104895, + "grad_norm": 0.7221694588661194, + "kl": 0.20753830671310425, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 542 + }, + { + "completion_length": 999.0, + "epoch": 1.8986013986013988, + "grad_norm": 0.8567831516265869, + "kl": 0.2119346261024475, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.0085, + "reward": 1.808333396911621, + "reward_std": 2.014302968978882, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 543 + }, + { + "completion_length": 497.16668701171875, + "epoch": 1.902097902097902, + "grad_norm": 0.5826951265335083, + "kl": 0.2415902316570282, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0097, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 544 + }, + { + "completion_length": 831.0, + "epoch": 1.9055944055944056, + "grad_norm": 0.5661029815673828, + "kl": 0.2621002495288849, + "learning_rate": 4.560315471555039e-06, + "loss": 0.0105, + "reward": 2.3000001907348633, + "reward_std": 1.8368451595306396, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 545 + }, + { + "completion_length": 190.6666717529297, + "epoch": 1.9090909090909092, + "grad_norm": 0.8984940648078918, + "kl": 0.261735200881958, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 546 + }, + { + "completion_length": 672.5, + "epoch": 1.9125874125874125, + "grad_norm": 0.6307451128959656, + "kl": 0.3331562280654907, + "learning_rate": 4.555360102576844e-06, + "loss": 0.0133, + "reward": 3.5916666984558105, + "reward_std": 0.5571505427360535, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.9250000715255737, + "step": 547 + }, + { + "completion_length": 193.5, + "epoch": 1.916083916083916, + "grad_norm": 0.9689189791679382, + "kl": 0.31761375069618225, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0127, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 548 + }, + { + "completion_length": 477.0, + "epoch": 1.9195804195804196, + "grad_norm": 1.1217161417007446, + "kl": 0.4803551435470581, + "learning_rate": 4.550379689684431e-06, + "loss": 0.0192, + "reward": 2.924999952316284, + "reward_std": 0.06123730167746544, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9249999523162842, + "step": 549 + }, + { + "completion_length": 501.66668701171875, + "epoch": 1.9230769230769231, + "grad_norm": 0.48732584714889526, + "kl": 0.3280116021633148, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0131, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 550 + }, + { + "completion_length": 190.5, + "epoch": 1.9265734265734267, + "grad_norm": 0.05169845372438431, + "kl": 0.2321687638759613, + "learning_rate": 4.545374293562559e-06, + "loss": 0.0117, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 551 + }, + { + "completion_length": 226.33334350585938, + "epoch": 1.93006993006993, + "grad_norm": 1.1284880638122559, + "kl": 0.3435511291027069, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0137, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 552 + }, + { + "completion_length": 197.5, + "epoch": 1.9335664335664335, + "grad_norm": 0.8085185289382935, + "kl": 0.2905815541744232, + "learning_rate": 4.540343975200401e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 553 + }, + { + "completion_length": 504.8333435058594, + "epoch": 1.937062937062937, + "grad_norm": 0.38323989510536194, + "kl": 0.26971811056137085, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0108, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 554 + }, + { + "completion_length": 172.5, + "epoch": 1.9405594405594404, + "grad_norm": 1.8462821245193481, + "kl": 0.32645952701568604, + "learning_rate": 4.535288795890799e-06, + "loss": 0.0131, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 555 + }, + { + "completion_length": 508.66668701171875, + "epoch": 1.9440559440559442, + "grad_norm": 0.48262494802474976, + "kl": 0.26610442996025085, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0106, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 556 + }, + { + "completion_length": 205.33334350585938, + "epoch": 1.9475524475524475, + "grad_norm": 0.8724077343940735, + "kl": 0.34979626536369324, + "learning_rate": 4.530208817229516e-06, + "loss": 0.014, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 557 + }, + { + "completion_length": 466.3333435058594, + "epoch": 1.951048951048951, + "grad_norm": 1.2409106492996216, + "kl": 0.5075003504753113, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0203, + "reward": 1.774999976158142, + "reward_std": 1.3299436569213867, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6083333492279053, + "step": 558 + }, + { + "completion_length": 201.0, + "epoch": 1.9545454545454546, + "grad_norm": 0.9538130760192871, + "kl": 0.22750967741012573, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.0091, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 559 + }, + { + "completion_length": 202.1666717529297, + "epoch": 1.958041958041958, + "grad_norm": 0.8161240220069885, + "kl": 0.28019654750823975, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0112, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 560 + }, + { + "completion_length": 515.5, + "epoch": 1.9615384615384617, + "grad_norm": 0.6905736327171326, + "kl": 0.20913702249526978, + "learning_rate": 4.519974709745076e-06, + "loss": 0.0084, + "reward": 2.2916667461395264, + "reward_std": 1.3492282629013062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 561 + }, + { + "completion_length": 201.5, + "epoch": 1.965034965034965, + "grad_norm": 1.109075665473938, + "kl": 0.29383933544158936, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0118, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 562 + }, + { + "completion_length": 849.0, + "epoch": 1.9685314685314685, + "grad_norm": 0.5454800128936768, + "kl": 0.16988810896873474, + "learning_rate": 4.51482070562129e-06, + "loss": 0.0068, + "reward": 2.4666666984558105, + "reward_std": 1.949530005455017, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 563 + }, + { + "completion_length": 826.0, + "epoch": 1.972027972027972, + "grad_norm": 0.521063506603241, + "kl": 0.2149253934621811, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0086, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 564 + }, + { + "completion_length": 502.8333435058594, + "epoch": 1.9755244755244754, + "grad_norm": 0.4798555076122284, + "kl": 0.26902374625205994, + "learning_rate": 4.509642151543043e-06, + "loss": 0.0108, + "reward": 1.625, + "reward_std": 0.7960842847824097, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 525.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.566384494304657, + "kl": 0.2703857123851776, + "learning_rate": 4.507043687977787e-06, + "loss": 0.0108, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 566 + }, + { + "completion_length": 194.33334350585938, + "epoch": 1.9825174825174825, + "grad_norm": 2.502077579498291, + "kl": 0.4179210364818573, + "learning_rate": 4.504439110609385e-06, + "loss": 0.0167, + "reward": 1.383333444595337, + "reward_std": 0.8920015096664429, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 567 + }, + { + "completion_length": 199.33334350585938, + "epoch": 1.986013986013986, + "grad_norm": 0.07109465450048447, + "kl": 0.2686344385147095, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 568 + }, + { + "completion_length": 190.83334350585938, + "epoch": 1.9895104895104896, + "grad_norm": 1.11842942237854, + "kl": 0.2603175640106201, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 569 + }, + { + "completion_length": 513.8333740234375, + "epoch": 1.993006993006993, + "grad_norm": 0.47602808475494385, + "kl": 0.20756664872169495, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0083, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 197.1666717529297, + "epoch": 1.9965034965034965, + "grad_norm": 0.7599025368690491, + "kl": 0.23664715886116028, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.0095, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 571 + }, + { + "completion_length": 207.83334350585938, + "epoch": 2.0, + "grad_norm": 0.7908173203468323, + "kl": 0.28615739941596985, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0114, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 572 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.0034965034965033, + "grad_norm": 0.9715352654457092, + "kl": 0.3183891177177429, + "learning_rate": 4.48868370213724e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 573 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.006993006993007, + "grad_norm": 2.3841874599456787, + "kl": 1.3214149475097656, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0529, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 574 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.0104895104895104, + "grad_norm": 0.9496575593948364, + "kl": 0.22735705971717834, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.0091, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 575 + }, + { + "completion_length": 511.0, + "epoch": 2.013986013986014, + "grad_norm": 0.6045878529548645, + "kl": 0.25393787026405334, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0102, + "reward": 1.4583333730697632, + "reward_std": 0.8187898397445679, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 222.1666717529297, + "epoch": 2.0174825174825175, + "grad_norm": 0.7379043102264404, + "kl": 0.22020569443702698, + "learning_rate": 4.478058832418726e-06, + "loss": 0.0088, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 577 + }, + { + "completion_length": 204.6666717529297, + "epoch": 2.020979020979021, + "grad_norm": 0.9404547810554504, + "kl": 0.2797861695289612, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 578 + }, + { + "completion_length": 206.6666717529297, + "epoch": 2.0244755244755246, + "grad_norm": 0.8784480690956116, + "kl": 0.24152153730392456, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.0097, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 579 + }, + { + "completion_length": 414.66668701171875, + "epoch": 2.027972027972028, + "grad_norm": 0.6715477705001831, + "kl": 0.21307629346847534, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0085, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 580 + }, + { + "completion_length": 528.5, + "epoch": 2.0314685314685317, + "grad_norm": 0.7886191010475159, + "kl": 0.4145243763923645, + "learning_rate": 4.467337554903344e-06, + "loss": 0.0166, + "reward": 3.5416667461395264, + "reward_std": 1.0002083778381348, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.875, + "step": 581 + }, + { + "completion_length": 457.5, + "epoch": 2.034965034965035, + "grad_norm": 5.719381809234619, + "kl": 1.370613932609558, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0548, + "reward": 1.9750001430511475, + "reward_std": 2.163504123687744, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.4749999940395355, + "step": 582 + }, + { + "completion_length": 361.5, + "epoch": 2.0384615384615383, + "grad_norm": 0.5381609201431274, + "kl": 0.23687216639518738, + "learning_rate": 4.461940926332708e-06, + "loss": 0.0095, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 583 + }, + { + "completion_length": 874.6666870117188, + "epoch": 2.041958041958042, + "grad_norm": 0.45025861263275146, + "kl": 0.16833463311195374, + "learning_rate": 4.4592336433146e-06, + "loss": 0.0067, + "reward": 2.9583334922790527, + "reward_std": 1.6554203033447266, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 584 + }, + { + "completion_length": 726.3333740234375, + "epoch": 2.0454545454545454, + "grad_norm": 0.4446694254875183, + "kl": 0.17844387888908386, + "learning_rate": 4.456520392131035e-06, + "loss": 0.0071, + "reward": 1.133333444595337, + "reward_std": 0.9595138430595398, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 585 + }, + { + "completion_length": 830.3333740234375, + "epoch": 2.0489510489510487, + "grad_norm": 0.8371572494506836, + "kl": 0.16316595673561096, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0065, + "reward": 1.524999976158142, + "reward_std": 1.1206024885177612, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 586 + }, + { + "completion_length": 110.5, + "epoch": 2.0524475524475525, + "grad_norm": 3.6648356914520264, + "kl": 0.4860494136810303, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.0194, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 587 + }, + { + "completion_length": 228.6666717529297, + "epoch": 2.055944055944056, + "grad_norm": 0.8717478513717651, + "kl": 0.28448450565338135, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 588 + }, + { + "completion_length": 614.0, + "epoch": 2.0594405594405596, + "grad_norm": 0.352130651473999, + "kl": 0.19009076058864594, + "learning_rate": 4.445607871315053e-06, + "loss": 0.0076, + "reward": 1.7333333492279053, + "reward_std": 0.5307227969169617, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 589 + }, + { + "completion_length": 476.3333435058594, + "epoch": 2.062937062937063, + "grad_norm": 2.5581870079040527, + "kl": 0.5677192807197571, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0227, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 590 + }, + { + "completion_length": 314.66668701171875, + "epoch": 2.0664335664335662, + "grad_norm": 0.657811164855957, + "kl": 0.20458662509918213, + "learning_rate": 4.440116017666365e-06, + "loss": 0.0082, + "reward": 3.116666793823242, + "reward_std": 1.3291600942611694, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 591 + }, + { + "completion_length": 516.0, + "epoch": 2.06993006993007, + "grad_norm": 0.473056823015213, + "kl": 0.19687163829803467, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 592 + }, + { + "completion_length": 217.0, + "epoch": 2.0734265734265733, + "grad_norm": 0.793745756149292, + "kl": 0.2862774133682251, + "learning_rate": 4.434600524316266e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 593 + }, + { + "completion_length": 216.0, + "epoch": 2.076923076923077, + "grad_norm": 0.7589979767799377, + "kl": 0.2887541651725769, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0116, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 594 + }, + { + "completion_length": 234.0, + "epoch": 2.0804195804195804, + "grad_norm": 0.952064037322998, + "kl": 0.30340343713760376, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.0121, + "reward": 2.5375001430511475, + "reward_std": 0.9115578532218933, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 595 + }, + { + "completion_length": 1109.8333740234375, + "epoch": 2.0839160839160837, + "grad_norm": 0.382217139005661, + "kl": 0.1974603831768036, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0079, + "reward": 1.7166666984558105, + "reward_std": 0.967298686504364, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7166666984558105, + "step": 596 + }, + { + "completion_length": 497.66668701171875, + "epoch": 2.0874125874125875, + "grad_norm": 0.7741627097129822, + "kl": 0.2393149733543396, + "learning_rate": 4.423498887617238e-06, + "loss": 0.0096, + "reward": 1.9583333730697632, + "reward_std": 1.400148868560791, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 518.0, + "epoch": 2.090909090909091, + "grad_norm": 0.534230649471283, + "kl": 0.22715210914611816, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0091, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 598 + }, + { + "completion_length": 502.8333435058594, + "epoch": 2.0944055944055946, + "grad_norm": 0.5411605834960938, + "kl": 0.2008448839187622, + "learning_rate": 4.41791287953825e-06, + "loss": 0.008, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 599 + }, + { + "completion_length": 545.6666870117188, + "epoch": 2.097902097902098, + "grad_norm": 0.44943779706954956, + "kl": 0.225155770778656, + "learning_rate": 4.415111107797445e-06, + "loss": 0.009, + "reward": 3.016666889190674, + "reward_std": 1.3952300548553467, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 600 + }, + { + "completion_length": 239.0, + "epoch": 2.1013986013986012, + "grad_norm": 0.9387716054916382, + "kl": 0.2535586357116699, + "learning_rate": 4.412303502296081e-06, + "loss": 0.0101, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 601 + }, + { + "completion_length": 188.0, + "epoch": 2.104895104895105, + "grad_norm": 3.3025033473968506, + "kl": 0.3564508557319641, + "learning_rate": 4.409490071586606e-06, + "loss": 0.0143, + "reward": 2.9583334922790527, + "reward_std": 1.6554205417633057, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 602 + }, + { + "completion_length": 526.8333740234375, + "epoch": 2.1083916083916083, + "grad_norm": 0.7135488986968994, + "kl": 0.25961729884147644, + "learning_rate": 4.406670824239221e-06, + "loss": 0.0104, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 603 + }, + { + "completion_length": 201.0, + "epoch": 2.111888111888112, + "grad_norm": 0.5526494979858398, + "kl": 0.26036110520362854, + "learning_rate": 4.403845768841842e-06, + "loss": 0.0104, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 604 + }, + { + "completion_length": 516.8333740234375, + "epoch": 2.1153846153846154, + "grad_norm": 0.4089651107788086, + "kl": 0.2617362141609192, + "learning_rate": 4.401014914000078e-06, + "loss": 0.0105, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 605 + }, + { + "completion_length": 192.5, + "epoch": 2.1188811188811187, + "grad_norm": 0.7996219396591187, + "kl": 0.30715522170066833, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0123, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 606 + }, + { + "completion_length": 793.3333740234375, + "epoch": 2.1223776223776225, + "grad_norm": 0.8545472025871277, + "kl": 0.20438644289970398, + "learning_rate": 4.395335840494131e-06, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 0.493710458278656, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.875, + "step": 607 + }, + { + "completion_length": 197.5, + "epoch": 2.125874125874126, + "grad_norm": 0.09662449359893799, + "kl": 0.2624778151512146, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0117, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 608 + }, + { + "completion_length": 199.0, + "epoch": 2.129370629370629, + "grad_norm": 0.8693634867668152, + "kl": 0.232680082321167, + "learning_rate": 4.389633672919099e-06, + "loss": 0.0093, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 609 + }, + { + "completion_length": 213.1666717529297, + "epoch": 2.132867132867133, + "grad_norm": 0.23271039128303528, + "kl": 0.2889987826347351, + "learning_rate": 4.386773950556931e-06, + "loss": 0.0139, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 610 + }, + { + "completion_length": 197.83334350585938, + "epoch": 2.1363636363636362, + "grad_norm": 0.8127601742744446, + "kl": 0.35951054096221924, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.0144, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 611 + }, + { + "completion_length": 164.6666717529297, + "epoch": 2.13986013986014, + "grad_norm": 1.0649946928024292, + "kl": 0.26743820309638977, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 612 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.1433566433566433, + "grad_norm": 0.8122753500938416, + "kl": 0.27118992805480957, + "learning_rate": 4.378160333758779e-06, + "loss": 0.0108, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 613 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.1468531468531467, + "grad_norm": 0.8640854358673096, + "kl": 0.2445271909236908, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0098, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 614 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.1503496503496504, + "grad_norm": 1.3168154954910278, + "kl": 0.2900705933570862, + "learning_rate": 4.372389301972506e-06, + "loss": 0.0116, + "reward": 1.7083333730697632, + "reward_std": 0.591960072517395, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 615 + }, + { + "completion_length": 241.6666717529297, + "epoch": 2.1538461538461537, + "grad_norm": 1.1053791046142578, + "kl": 0.4096168875694275, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0164, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 616 + }, + { + "completion_length": 147.83334350585938, + "epoch": 2.1573426573426575, + "grad_norm": 3.980419874191284, + "kl": 1.5825055837631226, + "learning_rate": 4.366595455713479e-06, + "loss": 0.0633, + "reward": 2.3000001907348633, + "reward_std": 1.4812158346176147, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 617 + }, + { + "completion_length": 197.0, + "epoch": 2.160839160839161, + "grad_norm": 0.8954426050186157, + "kl": 0.23646585643291473, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0095, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 618 + }, + { + "completion_length": 221.33334350585938, + "epoch": 2.164335664335664, + "grad_norm": 0.8455007076263428, + "kl": 0.25921204686164856, + "learning_rate": 4.360778865577885e-06, + "loss": 0.0104, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 619 + }, + { + "completion_length": 196.5, + "epoch": 2.167832167832168, + "grad_norm": 0.8735758662223816, + "kl": 0.27696120738983154, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 620 + }, + { + "completion_length": 177.83334350585938, + "epoch": 2.1713286713286712, + "grad_norm": 32.12022018432617, + "kl": 2.4454264640808105, + "learning_rate": 4.354939602439041e-06, + "loss": 0.0978, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 621 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.174825174825175, + "grad_norm": 2.8916237354278564, + "kl": 0.3946024775505066, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0158, + "reward": 3.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 622 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.1783216783216783, + "grad_norm": 1.4287588596343994, + "kl": 0.32967257499694824, + "learning_rate": 4.349077737446525e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 623 + }, + { + "completion_length": 229.83334350585938, + "epoch": 2.1818181818181817, + "grad_norm": 0.04024571180343628, + "kl": 0.2965821325778961, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0142, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 624 + }, + { + "completion_length": 153.83334350585938, + "epoch": 2.1853146853146854, + "grad_norm": 0.9452215433120728, + "kl": 0.26284661889076233, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0105, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 625 + }, + { + "completion_length": 162.1666717529297, + "epoch": 2.1888111888111887, + "grad_norm": 32.100563049316406, + "kl": 7.969426155090332, + "learning_rate": 4.340242717799337e-06, + "loss": 0.3188, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 175.5, + "epoch": 2.1923076923076925, + "grad_norm": 6.515329360961914, + "kl": 0.3849031627178192, + "learning_rate": 4.3372864878749e-06, + "loss": 0.0154, + "reward": 3.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 627 + }, + { + "completion_length": 504.3333435058594, + "epoch": 2.195804195804196, + "grad_norm": 0.6083482503890991, + "kl": 0.19082359969615936, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0076, + "reward": 2.4583334922790527, + "reward_std": 1.5001389980316162, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 628 + }, + { + "completion_length": 196.0, + "epoch": 2.199300699300699, + "grad_norm": 0.9820056557655334, + "kl": 0.2912360727787018, + "learning_rate": 4.331357246968447e-06, + "loss": 0.0116, + "reward": 1.9500001668930054, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 629 + }, + { + "completion_length": 544.0, + "epoch": 2.202797202797203, + "grad_norm": 0.5948340892791748, + "kl": 0.22720639407634735, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0091, + "reward": 2.375, + "reward_std": 0.6509608626365662, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8749999403953552, + "step": 630 + }, + { + "completion_length": 237.0, + "epoch": 2.2062937062937062, + "grad_norm": 0.0632646456360817, + "kl": 0.2671894431114197, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.0131, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 631 + }, + { + "completion_length": 501.16668701171875, + "epoch": 2.20979020979021, + "grad_norm": 0.44626739621162415, + "kl": 0.2233467698097229, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0089, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 632 + }, + { + "completion_length": 187.5, + "epoch": 2.2132867132867133, + "grad_norm": 0.9024590849876404, + "kl": 0.299750417470932, + "learning_rate": 4.319431894143027e-06, + "loss": 0.012, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 633 + }, + { + "completion_length": 532.5, + "epoch": 2.2167832167832167, + "grad_norm": 0.38001272082328796, + "kl": 0.28776365518569946, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0115, + "reward": 3.566666603088379, + "reward_std": 0.9389711618423462, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 634 + }, + { + "completion_length": 201.6666717529297, + "epoch": 2.2202797202797204, + "grad_norm": 1.1841076612472534, + "kl": 0.3013113737106323, + "learning_rate": 4.313435927530719e-06, + "loss": 0.0121, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 635 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.2237762237762237, + "grad_norm": 0.8018883466720581, + "kl": 0.2923080325126648, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 636 + }, + { + "completion_length": 525.3333740234375, + "epoch": 2.227272727272727, + "grad_norm": 0.4936811923980713, + "kl": 0.25341111421585083, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.0101, + "reward": 3.2083334922790527, + "reward_std": 0.9697508215904236, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 637 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.230769230769231, + "grad_norm": 0.6575815677642822, + "kl": 0.3100575804710388, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0124, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 638 + }, + { + "completion_length": 178.5, + "epoch": 2.234265734265734, + "grad_norm": 0.8525052666664124, + "kl": 0.31076908111572266, + "learning_rate": 4.301377779200826e-06, + "loss": 0.0124, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 639 + }, + { + "completion_length": 185.33334350585938, + "epoch": 2.237762237762238, + "grad_norm": 1.0106300115585327, + "kl": 0.30621784925460815, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0122, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 640 + }, + { + "completion_length": 186.5, + "epoch": 2.2412587412587412, + "grad_norm": 0.885761022567749, + "kl": 0.3738858103752136, + "learning_rate": 4.295315744407972e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 641 + }, + { + "completion_length": 171.6666717529297, + "epoch": 2.2447552447552446, + "grad_norm": 1.113839030265808, + "kl": 0.3465404212474823, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0139, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 642 + }, + { + "completion_length": 203.0, + "epoch": 2.2482517482517483, + "grad_norm": 0.8950809836387634, + "kl": 0.2658528983592987, + "learning_rate": 4.28923183425934e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 643 + }, + { + "completion_length": 198.5, + "epoch": 2.2517482517482517, + "grad_norm": 0.9561752080917358, + "kl": 0.31710129976272583, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0127, + "reward": 2.2833335399627686, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 644 + }, + { + "completion_length": 168.1666717529297, + "epoch": 2.2552447552447554, + "grad_norm": 0.8310069441795349, + "kl": 0.27687615156173706, + "learning_rate": 4.283126122885455e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 645 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.2587412587412588, + "grad_norm": 0.09269661456346512, + "kl": 0.2699682414531708, + "learning_rate": 4.280065114977492e-06, + "loss": 0.012, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 646 + }, + { + "completion_length": 163.6666717529297, + "epoch": 2.262237762237762, + "grad_norm": 1.2992812395095825, + "kl": 0.3616819381713867, + "learning_rate": 4.276998684682482e-06, + "loss": 0.0145, + "reward": 2.375, + "reward_std": 1.1847995519638062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 647 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.265734265734266, + "grad_norm": 0.8000275492668152, + "kl": 0.2609575390815735, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0104, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 648 + }, + { + "completion_length": 196.1666717529297, + "epoch": 2.269230769230769, + "grad_norm": 0.8786153197288513, + "kl": 0.3877195119857788, + "learning_rate": 4.270849594311323e-06, + "loss": 0.0155, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 649 + }, + { + "completion_length": 201.0, + "epoch": 2.2727272727272725, + "grad_norm": 0.9727340936660767, + "kl": 0.3743540942668915, + "learning_rate": 4.267766952966369e-06, + "loss": 0.015, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 650 + }, + { + "completion_length": 205.33334350585938, + "epoch": 2.2762237762237763, + "grad_norm": 0.09209764748811722, + "kl": 0.27989333868026733, + "learning_rate": 4.264678926696703e-06, + "loss": 0.0136, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 651 + }, + { + "completion_length": 202.5, + "epoch": 2.2797202797202796, + "grad_norm": 0.9205158948898315, + "kl": 0.3037436008453369, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0121, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 652 + }, + { + "completion_length": 304.66668701171875, + "epoch": 2.2832167832167833, + "grad_norm": 0.8844843506813049, + "kl": 0.3668223023414612, + "learning_rate": 4.25848675702626e-06, + "loss": 0.0147, + "reward": 1.9500001668930054, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.949999988079071, + "step": 653 + }, + { + "completion_length": 194.6666717529297, + "epoch": 2.2867132867132867, + "grad_norm": 1.0558805465698242, + "kl": 0.3064219057559967, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0123, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 654 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.29020979020979, + "grad_norm": 0.9313608407974243, + "kl": 0.31230098009109497, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.0125, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 655 + }, + { + "completion_length": 211.1666717529297, + "epoch": 2.2937062937062938, + "grad_norm": 0.19107016921043396, + "kl": 0.373710036277771, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0173, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 656 + }, + { + "completion_length": 348.8333435058594, + "epoch": 2.297202797202797, + "grad_norm": 0.7309221029281616, + "kl": 0.3733287751674652, + "learning_rate": 4.246038213577516e-06, + "loss": 0.0149, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 657 + }, + { + "completion_length": 180.6666717529297, + "epoch": 2.300699300699301, + "grad_norm": 0.8861889839172363, + "kl": 0.35562607645988464, + "learning_rate": 4.242912757136412e-06, + "loss": 0.0142, + "reward": 2.616666793823242, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 658 + }, + { + "completion_length": 204.5, + "epoch": 2.304195804195804, + "grad_norm": 0.7407400608062744, + "kl": 0.28287678956985474, + "learning_rate": 4.239781991480786e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 659 + }, + { + "completion_length": 174.0, + "epoch": 2.3076923076923075, + "grad_norm": 8.534856796264648, + "kl": 1.5403010845184326, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0616, + "reward": 2.2916667461395264, + "reward_std": 0.8002604246139526, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7916666865348816, + "step": 660 + }, + { + "completion_length": 184.33334350585938, + "epoch": 2.3111888111888113, + "grad_norm": 0.06887773424386978, + "kl": 0.2856985628604889, + "learning_rate": 4.233504570689533e-06, + "loss": 0.0138, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 661 + }, + { + "completion_length": 202.83334350585938, + "epoch": 2.3146853146853146, + "grad_norm": 0.8288156986236572, + "kl": 0.2896421253681183, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0116, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 662 + }, + { + "completion_length": 207.6666717529297, + "epoch": 2.3181818181818183, + "grad_norm": 1.119509220123291, + "kl": 0.4124630391597748, + "learning_rate": 4.227206027692146e-06, + "loss": 0.0165, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 663 + }, + { + "completion_length": 198.1666717529297, + "epoch": 2.3216783216783217, + "grad_norm": 0.8312250971794128, + "kl": 0.3108134865760803, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0124, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 664 + }, + { + "completion_length": 610.8333740234375, + "epoch": 2.325174825174825, + "grad_norm": 0.5707215070724487, + "kl": 0.23091670870780945, + "learning_rate": 4.220886439234385e-06, + "loss": 0.0092, + "reward": 2.383333444595337, + "reward_std": 1.5413198471069336, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.7166666984558105, + "step": 665 + }, + { + "completion_length": 460.0, + "epoch": 2.3286713286713288, + "grad_norm": 10.873461723327637, + "kl": 2.6264634132385254, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1051, + "reward": 1.4666666984558105, + "reward_std": 1.356711745262146, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 666 + }, + { + "completion_length": 207.0, + "epoch": 2.332167832167832, + "grad_norm": 0.6674370765686035, + "kl": 0.2692621350288391, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.0108, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 667 + }, + { + "completion_length": 567.8333740234375, + "epoch": 2.335664335664336, + "grad_norm": 0.42179885506629944, + "kl": 0.2716664671897888, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0109, + "reward": 3.566666603088379, + "reward_std": 0.938971221446991, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9000000357627869, + "step": 668 + }, + { + "completion_length": 223.5, + "epoch": 2.339160839160839, + "grad_norm": 0.6866164803504944, + "kl": 0.24070698022842407, + "learning_rate": 4.208184434201999e-06, + "loss": 0.0096, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 669 + }, + { + "completion_length": 214.5, + "epoch": 2.3426573426573425, + "grad_norm": 0.9751102924346924, + "kl": 0.2499878704547882, + "learning_rate": 4.204995900156247e-06, + "loss": 0.01, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 670 + }, + { + "completion_length": 182.33334350585938, + "epoch": 2.3461538461538463, + "grad_norm": 3.7804720401763916, + "kl": 0.46188828349113464, + "learning_rate": 4.201802172397295e-06, + "loss": 0.0185, + "reward": 3.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 671 + }, + { + "completion_length": 1019.6666870117188, + "epoch": 2.3496503496503496, + "grad_norm": 0.4247821569442749, + "kl": 0.21799665689468384, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0087, + "reward": 2.7166669368743896, + "reward_std": 1.6418485641479492, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666388511658, + "step": 672 + }, + { + "completion_length": 753.8333740234375, + "epoch": 2.3531468531468533, + "grad_norm": 0.5194523334503174, + "kl": 0.22523364424705505, + "learning_rate": 4.195399174670177e-06, + "loss": 0.009, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 673 + }, + { + "completion_length": 573.6666870117188, + "epoch": 2.3566433566433567, + "grad_norm": 0.5000849366188049, + "kl": 0.22850388288497925, + "learning_rate": 4.192189924206652e-06, + "loss": 0.0091, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 674 + }, + { + "completion_length": 1213.8333740234375, + "epoch": 2.36013986013986, + "grad_norm": 0.5522187352180481, + "kl": 0.177886962890625, + "learning_rate": 4.188975519039151e-06, + "loss": 0.0071, + "reward": 1.3916667699813843, + "reward_std": 1.4026464223861694, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5583333969116211, + "step": 675 + }, + { + "completion_length": 872.6666870117188, + "epoch": 2.3636363636363638, + "grad_norm": 0.4857361912727356, + "kl": 0.20906971395015717, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0084, + "reward": 2.9083335399627686, + "reward_std": 1.696000337600708, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7416666746139526, + "step": 676 + }, + { + "completion_length": 502.0, + "epoch": 2.367132867132867, + "grad_norm": 0.5935739278793335, + "kl": 0.27800655364990234, + "learning_rate": 4.182531283774434e-06, + "loss": 0.0111, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 677 + }, + { + "completion_length": 738.5, + "epoch": 2.370629370629371, + "grad_norm": 0.5985221862792969, + "kl": 0.2548876702785492, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0102, + "reward": 2.2125000953674316, + "reward_std": 1.3164108991622925, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7125000357627869, + "step": 678 + }, + { + "completion_length": 173.83334350585938, + "epoch": 2.374125874125874, + "grad_norm": 1.7061294317245483, + "kl": 0.3693540692329407, + "learning_rate": 4.176066547396998e-06, + "loss": 0.0148, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 679 + }, + { + "completion_length": 203.6666717529297, + "epoch": 2.3776223776223775, + "grad_norm": 1.0101178884506226, + "kl": 0.31931599974632263, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0128, + "reward": 2.2083334922790527, + "reward_std": 1.1577637195587158, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 680 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.3811188811188813, + "grad_norm": 0.8966777920722961, + "kl": 0.3051684498786926, + "learning_rate": 4.169581388677617e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 681 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.3846153846153846, + "grad_norm": 0.7840998768806458, + "kl": 0.31647345423698425, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0127, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 682 + }, + { + "completion_length": 209.1666717529297, + "epoch": 2.3881118881118883, + "grad_norm": 0.9048584699630737, + "kl": 0.25157231092453003, + "learning_rate": 4.163075886635902e-06, + "loss": 0.0101, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 683 + }, + { + "completion_length": 494.66668701171875, + "epoch": 2.3916083916083917, + "grad_norm": 0.612885057926178, + "kl": 0.1984379142522812, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0079, + "reward": 2.125, + "reward_std": 1.3129165172576904, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 684 + }, + { + "completion_length": 182.83334350585938, + "epoch": 2.395104895104895, + "grad_norm": 1.069145679473877, + "kl": 0.33643895387649536, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 685 + }, + { + "completion_length": 192.6666717529297, + "epoch": 2.3986013986013988, + "grad_norm": 0.8116271495819092, + "kl": 0.29202282428741455, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0117, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 686 + }, + { + "completion_length": 196.0, + "epoch": 2.402097902097902, + "grad_norm": 0.9276851415634155, + "kl": 0.31228408217430115, + "learning_rate": 4.150004169902343e-06, + "loss": 0.0125, + "reward": 1.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 687 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.4055944055944054, + "grad_norm": 1.0499162673950195, + "kl": 0.24672053754329681, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0099, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 688 + }, + { + "completion_length": 219.1666717529297, + "epoch": 2.409090909090909, + "grad_norm": 0.7051374912261963, + "kl": 0.24717721343040466, + "learning_rate": 4.14343811448524e-06, + "loss": 0.0099, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 689 + }, + { + "completion_length": 226.5, + "epoch": 2.4125874125874125, + "grad_norm": 0.7789434194564819, + "kl": 0.2564643919467926, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0103, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 690 + }, + { + "completion_length": 212.0, + "epoch": 2.4160839160839163, + "grad_norm": 0.8126075267791748, + "kl": 0.23958399891853333, + "learning_rate": 4.136852034293349e-06, + "loss": 0.0096, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 691 + }, + { + "completion_length": 210.6666717529297, + "epoch": 2.4195804195804196, + "grad_norm": 0.8626409769058228, + "kl": 0.2777412533760071, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0111, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 692 + }, + { + "completion_length": 529.8333740234375, + "epoch": 2.423076923076923, + "grad_norm": 0.5266372561454773, + "kl": 0.2946487069129944, + "learning_rate": 4.130246009575981e-06, + "loss": 0.0118, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 693 + }, + { + "completion_length": 217.6666717529297, + "epoch": 2.4265734265734267, + "grad_norm": 0.814607560634613, + "kl": 0.31643202900886536, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 694 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.43006993006993, + "grad_norm": 0.6121898293495178, + "kl": 0.24353787302970886, + "learning_rate": 4.123620120825459e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 695 + }, + { + "completion_length": 416.16668701171875, + "epoch": 2.4335664335664333, + "grad_norm": 0.65854811668396, + "kl": 0.29339665174484253, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 696 + }, + { + "completion_length": 524.3333740234375, + "epoch": 2.437062937062937, + "grad_norm": 0.5596239566802979, + "kl": 0.26455265283584595, + "learning_rate": 4.11697444877615e-06, + "loss": 0.0106, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 697 + }, + { + "completion_length": 173.6666717529297, + "epoch": 2.4405594405594404, + "grad_norm": 2.7013747692108154, + "kl": 0.5755926370620728, + "learning_rate": 4.113644219309877e-06, + "loss": 0.023, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 698 + }, + { + "completion_length": 893.0, + "epoch": 2.444055944055944, + "grad_norm": 0.5892761945724487, + "kl": 0.22364209592342377, + "learning_rate": 4.110309074403467e-06, + "loss": 0.0089, + "reward": 2.433333396911621, + "reward_std": 1.3728317022323608, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7666666507720947, + "step": 699 + }, + { + "completion_length": 1029.166748046875, + "epoch": 2.4475524475524475, + "grad_norm": 0.41362571716308594, + "kl": 0.20189592242240906, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0081, + "reward": 1.7416666746139526, + "reward_std": 0.9625055193901062, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7416666746139526, + "step": 700 + }, + { + "completion_length": 200.5, + "epoch": 2.451048951048951, + "grad_norm": 0.9199966788291931, + "kl": 0.29405680298805237, + "learning_rate": 4.103624078922895e-06, + "loss": 0.0118, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 701 + }, + { + "completion_length": 551.8333740234375, + "epoch": 2.4545454545454546, + "grad_norm": 0.5847578644752502, + "kl": 0.30494964122772217, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0122, + "reward": 2.4583334922790527, + "reward_std": 1.3603003025054932, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 702 + }, + { + "completion_length": 158.5, + "epoch": 2.458041958041958, + "grad_norm": 3.148179054260254, + "kl": 0.33209604024887085, + "learning_rate": 4.096919543788995e-06, + "loss": 0.0133, + "reward": 2.9583334922790527, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 703 + }, + { + "completion_length": 635.1666870117188, + "epoch": 2.4615384615384617, + "grad_norm": 0.7368152141571045, + "kl": 0.2001763880252838, + "learning_rate": 4.093559974371725e-06, + "loss": 0.008, + "reward": 3.204166889190674, + "reward_std": 0.42143115401268005, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 704 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.465034965034965, + "grad_norm": 0.7404118776321411, + "kl": 0.2592664361000061, + "learning_rate": 4.09019555069441e-06, + "loss": 0.0104, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 705 + }, + { + "completion_length": 203.5, + "epoch": 2.4685314685314683, + "grad_norm": 0.7086665630340576, + "kl": 0.28512802720069885, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0114, + "reward": 2.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 706 + }, + { + "completion_length": 175.33334350585938, + "epoch": 2.472027972027972, + "grad_norm": 3.0447657108306885, + "kl": 0.38635802268981934, + "learning_rate": 4.083452181568876e-06, + "loss": 0.0155, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 707 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.4755244755244754, + "grad_norm": 0.7985562682151794, + "kl": 0.30575287342071533, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 708 + }, + { + "completion_length": 212.5, + "epoch": 2.479020979020979, + "grad_norm": 1.0262845754623413, + "kl": 0.30596381425857544, + "learning_rate": 4.076689518578217e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 709 + }, + { + "completion_length": 199.5, + "epoch": 2.4825174825174825, + "grad_norm": 0.8163771629333496, + "kl": 0.23148366808891296, + "learning_rate": 4.073300977624594e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 710 + }, + { + "completion_length": 228.33334350585938, + "epoch": 2.486013986013986, + "grad_norm": 0.6531832218170166, + "kl": 0.27565860748291016, + "learning_rate": 4.069907644123346e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 711 + }, + { + "completion_length": 487.16668701171875, + "epoch": 2.4895104895104896, + "grad_norm": 0.3693908452987671, + "kl": 0.32342347502708435, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0129, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 191.83334350585938, + "epoch": 2.493006993006993, + "grad_norm": 0.822213351726532, + "kl": 0.3490138649940491, + "learning_rate": 4.063106640839264e-06, + "loss": 0.014, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 713 + }, + { + "completion_length": 178.6666717529297, + "epoch": 2.4965034965034967, + "grad_norm": 0.7303230166435242, + "kl": 0.26454809308052063, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 714 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.5, + "grad_norm": 0.792052149772644, + "kl": 0.32973194122314453, + "learning_rate": 4.056286591594049e-06, + "loss": 0.0132, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 715 + }, + { + "completion_length": 211.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.6441434025764465, + "kl": 0.3346059024333954, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0134, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 716 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.506993006993007, + "grad_norm": 2.2384145259857178, + "kl": 0.4402106702327728, + "learning_rate": 4.049447579487851e-06, + "loss": 0.0176, + "reward": 3.016666889190674, + "reward_std": 0.9521903991699219, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 717 + }, + { + "completion_length": 825.8333740234375, + "epoch": 2.5104895104895104, + "grad_norm": 0.4227934777736664, + "kl": 0.19202569127082825, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0077, + "reward": 2.7916667461395264, + "reward_std": 1.5844295024871826, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 718 + }, + { + "completion_length": 199.6666717529297, + "epoch": 2.513986013986014, + "grad_norm": 0.7948997020721436, + "kl": 0.30144181847572327, + "learning_rate": 4.0425896878518725e-06, + "loss": 0.0121, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 719 + }, + { + "completion_length": 200.6666717529297, + "epoch": 2.5174825174825175, + "grad_norm": 0.7969666123390198, + "kl": 0.2623240351676941, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0105, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 720 + }, + { + "completion_length": 184.6666717529297, + "epoch": 2.520979020979021, + "grad_norm": 1.1336637735366821, + "kl": 0.2935950756072998, + "learning_rate": 4.035713000247358e-06, + "loss": 0.0117, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 721 + }, + { + "completion_length": 667.3333740234375, + "epoch": 2.5244755244755246, + "grad_norm": 0.39087414741516113, + "kl": 0.2444695681333542, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0098, + "reward": 3.704166889190674, + "reward_std": 0.6021662950515747, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8708332777023315, + "step": 722 + }, + { + "completion_length": 818.5, + "epoch": 2.527972027972028, + "grad_norm": 0.42902201414108276, + "kl": 0.18485748767852783, + "learning_rate": 4.028817600464579e-06, + "loss": 0.0074, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 723 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.5314685314685317, + "grad_norm": 0.5554837584495544, + "kl": 0.3039252758026123, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0122, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 724 + }, + { + "completion_length": 519.3333740234375, + "epoch": 2.534965034965035, + "grad_norm": 0.44166073203086853, + "kl": 0.24431876838207245, + "learning_rate": 4.021903572521802e-06, + "loss": 0.0098, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 725 + }, + { + "completion_length": 200.0, + "epoch": 2.5384615384615383, + "grad_norm": 0.7037209868431091, + "kl": 0.3631229102611542, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0145, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 726 + }, + { + "completion_length": 192.5, + "epoch": 2.541958041958042, + "grad_norm": 0.664789617061615, + "kl": 0.29182663559913635, + "learning_rate": 4.0149710006642775e-06, + "loss": 0.0117, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 727 + }, + { + "completion_length": 198.5, + "epoch": 2.5454545454545454, + "grad_norm": 1.0678514242172241, + "kl": 0.28828293085098267, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 728 + }, + { + "completion_length": 213.83334350585938, + "epoch": 2.548951048951049, + "grad_norm": 0.8395413756370544, + "kl": 0.3076155185699463, + "learning_rate": 4.008019969363206e-06, + "loss": 0.0123, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 729 + }, + { + "completion_length": 212.0, + "epoch": 2.5524475524475525, + "grad_norm": 0.7780301570892334, + "kl": 0.2876867651939392, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0115, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 730 + }, + { + "completion_length": 183.33334350585938, + "epoch": 2.555944055944056, + "grad_norm": 0.043716005980968475, + "kl": 0.40688663721084595, + "learning_rate": 4.001050563314711e-06, + "loss": 0.0187, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 731 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.5594405594405596, + "grad_norm": 0.7270947098731995, + "kl": 0.2820360064506531, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 732 + }, + { + "completion_length": 522.0, + "epoch": 2.562937062937063, + "grad_norm": 0.5480185747146606, + "kl": 0.2843058109283447, + "learning_rate": 3.994062867438803e-06, + "loss": 0.0114, + "reward": 3.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 733 + }, + { + "completion_length": 192.0, + "epoch": 2.5664335664335667, + "grad_norm": 0.733644962310791, + "kl": 0.27982231974601746, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0112, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 734 + }, + { + "completion_length": 198.6666717529297, + "epoch": 2.56993006993007, + "grad_norm": 0.7122451066970825, + "kl": 0.36668699979782104, + "learning_rate": 3.987056966878354e-06, + "loss": 0.0147, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 735 + }, + { + "completion_length": 204.33334350585938, + "epoch": 2.5734265734265733, + "grad_norm": 0.07662484794855118, + "kl": 0.3632362484931946, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 736 + }, + { + "completion_length": 189.1666717529297, + "epoch": 2.5769230769230766, + "grad_norm": 0.34811052680015564, + "kl": 0.4749183654785156, + "learning_rate": 3.9800329469980495e-06, + "loss": 0.0214, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 737 + }, + { + "completion_length": 583.5, + "epoch": 2.5804195804195804, + "grad_norm": 0.3855575919151306, + "kl": 0.2539462447166443, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0102, + "reward": 2.704166889190674, + "reward_std": 0.6021661758422852, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 738 + }, + { + "completion_length": 174.6666717529297, + "epoch": 2.583916083916084, + "grad_norm": 1.0900449752807617, + "kl": 0.3619951605796814, + "learning_rate": 3.972990893383356e-06, + "loss": 0.0145, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 739 + }, + { + "completion_length": 203.1666717529297, + "epoch": 2.5874125874125875, + "grad_norm": 0.9708390831947327, + "kl": 0.28454601764678955, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0114, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 740 + }, + { + "completion_length": 522.1666870117188, + "epoch": 2.590909090909091, + "grad_norm": 0.6295937895774841, + "kl": 0.26834964752197266, + "learning_rate": 3.965930891839473e-06, + "loss": 0.0107, + "reward": 3.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 741 + }, + { + "completion_length": 703.5, + "epoch": 2.594405594405594, + "grad_norm": 0.35760697722435, + "kl": 0.28400832414627075, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0114, + "reward": 3.066666603088379, + "reward_std": 0.2857738435268402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8999999761581421, + "step": 742 + }, + { + "completion_length": 833.8333740234375, + "epoch": 2.597902097902098, + "grad_norm": 0.5528135895729065, + "kl": 0.28165918588638306, + "learning_rate": 3.958853028390294e-06, + "loss": 0.0113, + "reward": 2.3583335876464844, + "reward_std": 1.752831220626831, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 743 + }, + { + "completion_length": 143.33334350585938, + "epoch": 2.6013986013986012, + "grad_norm": 0.7684369683265686, + "kl": 0.3106473684310913, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0124, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 744 + }, + { + "completion_length": 789.1666870117188, + "epoch": 2.604895104895105, + "grad_norm": 0.9867936372756958, + "kl": 0.2591046094894409, + "learning_rate": 3.951757389277349e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 0.5224940180778503, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 745 + }, + { + "completion_length": 194.83334350585938, + "epoch": 2.6083916083916083, + "grad_norm": 0.7808223962783813, + "kl": 0.30762046575546265, + "learning_rate": 3.948202930856697e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 746 + }, + { + "completion_length": 503.8333435058594, + "epoch": 2.6118881118881117, + "grad_norm": 0.6441946625709534, + "kl": 0.2855534851551056, + "learning_rate": 3.944644060958764e-06, + "loss": 0.0114, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 747 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6153846153846154, + "grad_norm": 0.8443914651870728, + "kl": 0.32207822799682617, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0129, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 748 + }, + { + "completion_length": 193.6666717529297, + "epoch": 2.6188811188811187, + "grad_norm": 0.620596706867218, + "kl": 0.33432909846305847, + "learning_rate": 3.9375131301081974e-06, + "loss": 0.0134, + "reward": 2.616666793823242, + "reward_std": 1.0327956676483154, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 749 + }, + { + "completion_length": 218.33334350585938, + "epoch": 2.6223776223776225, + "grad_norm": 0.8599146604537964, + "kl": 0.2318965494632721, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0093, + "reward": 2.0375001430511475, + "reward_std": 0.4857339859008789, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 750 + }, + { + "completion_length": 196.83334350585938, + "epoch": 2.625874125874126, + "grad_norm": 0.042067479342222214, + "kl": 0.25582045316696167, + "learning_rate": 3.930364683613791e-06, + "loss": 0.0114, + "reward": 1.9500001668930054, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 751 + }, + { + "completion_length": 187.6666717529297, + "epoch": 2.629370629370629, + "grad_norm": 0.6770573854446411, + "kl": 0.2656649649143219, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0106, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 752 + }, + { + "completion_length": 217.5, + "epoch": 2.632867132867133, + "grad_norm": 1.6130694150924683, + "kl": 0.29323238134384155, + "learning_rate": 3.923198808577111e-06, + "loss": 0.0117, + "reward": 2.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 753 + }, + { + "completion_length": 224.83334350585938, + "epoch": 2.6363636363636362, + "grad_norm": 0.7095122933387756, + "kl": 0.27353787422180176, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 754 + }, + { + "completion_length": 827.5, + "epoch": 2.63986013986014, + "grad_norm": 0.5739628076553345, + "kl": 0.21068716049194336, + "learning_rate": 3.916015592312083e-06, + "loss": 0.0084, + "reward": 2.883333206176758, + "reward_std": 1.7192052602767944, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 755 + }, + { + "completion_length": 186.0, + "epoch": 2.6433566433566433, + "grad_norm": 0.8608355522155762, + "kl": 0.3407597243785858, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0136, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 756 + }, + { + "completion_length": 556.3333740234375, + "epoch": 2.6468531468531467, + "grad_norm": 0.3163861036300659, + "kl": 0.2427646368741989, + "learning_rate": 3.908815122343929e-06, + "loss": 0.0097, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 757 + }, + { + "completion_length": 187.5, + "epoch": 2.6503496503496504, + "grad_norm": 0.8031748533248901, + "kl": 0.30763155221939087, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0123, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 758 + }, + { + "completion_length": 218.1666717529297, + "epoch": 2.6538461538461537, + "grad_norm": 0.8372368216514587, + "kl": 0.28790879249572754, + "learning_rate": 3.901597486408105e-06, + "loss": 0.0115, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 759 + }, + { + "completion_length": 1181.8333740234375, + "epoch": 2.6573426573426575, + "grad_norm": 0.647392988204956, + "kl": 0.20365619659423828, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0081, + "reward": 1.7208335399627686, + "reward_std": 1.5208892822265625, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5541666746139526, + "step": 760 + }, + { + "completion_length": 180.1666717529297, + "epoch": 2.660839160839161, + "grad_norm": 0.6884165406227112, + "kl": 0.2719978392124176, + "learning_rate": 3.894362772449226e-06, + "loss": 0.0109, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 761 + }, + { + "completion_length": 257.8333435058594, + "epoch": 2.664335664335664, + "grad_norm": 1.337699055671692, + "kl": 0.5194430351257324, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0208, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 762 + }, + { + "completion_length": 498.16668701171875, + "epoch": 2.667832167832168, + "grad_norm": 0.9563208818435669, + "kl": 0.3499029874801636, + "learning_rate": 3.887111068619999e-06, + "loss": 0.014, + "reward": 1.75, + "reward_std": 1.1730302572250366, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.75, + "step": 763 + }, + { + "completion_length": 215.33334350585938, + "epoch": 2.6713286713286712, + "grad_norm": 0.5849650502204895, + "kl": 0.21754197776317596, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0087, + "reward": 2.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 764 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.674825174825175, + "grad_norm": 0.5816351771354675, + "kl": 0.2685267925262451, + "learning_rate": 3.879842463280146e-06, + "loss": 0.0107, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 765 + }, + { + "completion_length": 190.1666717529297, + "epoch": 2.6783216783216783, + "grad_norm": 0.7096436023712158, + "kl": 0.3302849531173706, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0132, + "reward": 3.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 766 + }, + { + "completion_length": 205.83334350585938, + "epoch": 2.6818181818181817, + "grad_norm": 0.7019976377487183, + "kl": 0.3386441469192505, + "learning_rate": 3.87255704499533e-06, + "loss": 0.0135, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 767 + }, + { + "completion_length": 220.83334350585938, + "epoch": 2.6853146853146854, + "grad_norm": 0.7764424681663513, + "kl": 0.25025084614753723, + "learning_rate": 3.868908058731376e-06, + "loss": 0.01, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 768 + }, + { + "completion_length": 191.1666717529297, + "epoch": 2.6888111888111887, + "grad_norm": 0.6796668767929077, + "kl": 0.2684442698955536, + "learning_rate": 3.865254902536073e-06, + "loss": 0.0107, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 769 + }, + { + "completion_length": 898.3333740234375, + "epoch": 2.6923076923076925, + "grad_norm": 0.38831865787506104, + "kl": 0.14873462915420532, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0059, + "reward": 1.9666666984558105, + "reward_std": 1.649444341659546, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.6333333253860474, + "step": 770 + }, + { + "completion_length": 1279.166748046875, + "epoch": 2.695804195804196, + "grad_norm": 0.6360457539558411, + "kl": 0.1556037813425064, + "learning_rate": 3.857936124876677e-06, + "loss": 0.0062, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 771 + }, + { + "completion_length": 185.6666717529297, + "epoch": 2.699300699300699, + "grad_norm": 0.8891352415084839, + "kl": 0.2973707318305969, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0119, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.949999988079071, + "step": 772 + }, + { + "completion_length": 223.0, + "epoch": 2.702797202797203, + "grad_norm": 0.9200516939163208, + "kl": 0.2344827651977539, + "learning_rate": 3.850600801194138e-06, + "loss": 0.0094, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 773 + }, + { + "completion_length": 181.6666717529297, + "epoch": 2.7062937062937062, + "grad_norm": 1.2495554685592651, + "kl": 0.4023559093475342, + "learning_rate": 3.846926962517158e-06, + "loss": 0.0161, + "reward": 2.4000000953674316, + "reward_std": 0.6123725175857544, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8999999761581421, + "step": 774 + }, + { + "completion_length": 186.83334350585938, + "epoch": 2.70979020979021, + "grad_norm": 0.7409746646881104, + "kl": 0.2839186489582062, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.0114, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 775 + }, + { + "completion_length": 187.1666717529297, + "epoch": 2.7132867132867133, + "grad_norm": 0.9320999383926392, + "kl": 0.2990000247955322, + "learning_rate": 3.839566987447492e-06, + "loss": 0.012, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 776 + }, + { + "completion_length": 531.8333740234375, + "epoch": 2.7167832167832167, + "grad_norm": 0.3263534903526306, + "kl": 0.2381911277770996, + "learning_rate": 3.835880873474567e-06, + "loss": 0.0095, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 777 + }, + { + "completion_length": 1248.666748046875, + "epoch": 2.7202797202797204, + "grad_norm": 0.5097912549972534, + "kl": 0.1756594479084015, + "learning_rate": 3.832190690176825e-06, + "loss": 0.007, + "reward": 2.25, + "reward_std": 1.957294225692749, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5833333730697632, + "step": 778 + }, + { + "completion_length": 546.5, + "epoch": 2.7237762237762237, + "grad_norm": 0.38489583134651184, + "kl": 0.233808696269989, + "learning_rate": 3.828496448795208e-06, + "loss": 0.0094, + "reward": 2.7916667461395264, + "reward_std": 1.4527273178100586, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 779 + }, + { + "completion_length": 200.5, + "epoch": 2.7272727272727275, + "grad_norm": 0.6196880340576172, + "kl": 0.28656402230262756, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0115, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 780 + }, + { + "completion_length": 206.83334350585938, + "epoch": 2.730769230769231, + "grad_norm": 0.06716328859329224, + "kl": 0.35444962978363037, + "learning_rate": 3.821095836805868e-06, + "loss": 0.0166, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 781 + }, + { + "completion_length": 1108.0, + "epoch": 2.734265734265734, + "grad_norm": 0.46010759472846985, + "kl": 0.2134471833705902, + "learning_rate": 3.817389488741694e-06, + "loss": 0.0085, + "reward": 2.9375, + "reward_std": 1.2437593936920166, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7708332538604736, + "step": 782 + }, + { + "completion_length": 192.83334350585938, + "epoch": 2.737762237762238, + "grad_norm": 0.7892248034477234, + "kl": 0.27930283546447754, + "learning_rate": 3.8136791276806695e-06, + "loss": 0.0112, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 783 + }, + { + "completion_length": 526.1666870117188, + "epoch": 2.7412587412587412, + "grad_norm": 0.5663818120956421, + "kl": 0.23246847093105316, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.0093, + "reward": 2.183333396911621, + "reward_std": 1.0230673551559448, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 784 + }, + { + "completion_length": 302.0, + "epoch": 2.744755244755245, + "grad_norm": 0.8390914797782898, + "kl": 0.304746150970459, + "learning_rate": 3.806246411789872e-06, + "loss": 0.0122, + "reward": 2.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 785 + }, + { + "completion_length": 463.66668701171875, + "epoch": 2.7482517482517483, + "grad_norm": 0.4586171507835388, + "kl": 0.2490534633398056, + "learning_rate": 3.802524079601442e-06, + "loss": 0.01, + "reward": 2.2916667461395264, + "reward_std": 1.19181227684021, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 786 + }, + { + "completion_length": 538.1666870117188, + "epoch": 2.7517482517482517, + "grad_norm": 0.4255636930465698, + "kl": 0.21123819053173065, + "learning_rate": 3.798797779698774e-06, + "loss": 0.0084, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 787 + }, + { + "completion_length": 499.0, + "epoch": 2.755244755244755, + "grad_norm": 0.5292470455169678, + "kl": 0.24850648641586304, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0099, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 788 + }, + { + "completion_length": 527.6666870117188, + "epoch": 2.7587412587412588, + "grad_norm": 0.6640042662620544, + "kl": 0.21320059895515442, + "learning_rate": 3.791333322166605e-06, + "loss": 0.0085, + "reward": 2.3500001430511475, + "reward_std": 0.6928203105926514, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 789 + }, + { + "completion_length": 212.5, + "epoch": 2.762237762237762, + "grad_norm": 0.7885140776634216, + "kl": 0.25268059968948364, + "learning_rate": 3.787595187275136e-06, + "loss": 0.0101, + "reward": 3.116666793823242, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 790 + }, + { + "completion_length": 185.0, + "epoch": 2.765734265734266, + "grad_norm": 1.2679868936538696, + "kl": 0.35767948627471924, + "learning_rate": 3.7838531301454257e-06, + "loss": 0.0143, + "reward": 2.3500001430511475, + "reward_std": 0.9380831718444824, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 791 + }, + { + "completion_length": 202.0, + "epoch": 2.769230769230769, + "grad_norm": 0.6652596592903137, + "kl": 0.2619841694831848, + "learning_rate": 3.780107162176429e-06, + "loss": 0.0105, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 792 + }, + { + "completion_length": 474.0, + "epoch": 2.7727272727272725, + "grad_norm": 8.084759712219238, + "kl": 2.9472758769989014, + "learning_rate": 3.776357294779015e-06, + "loss": 0.1179, + "reward": 2.133333444595337, + "reward_std": 1.6972527503967285, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 793 + }, + { + "completion_length": 487.3333435058594, + "epoch": 2.7762237762237763, + "grad_norm": 0.43876194953918457, + "kl": 0.234140545129776, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0094, + "reward": 2.875, + "reward_std": 1.2624380588531494, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 794 + }, + { + "completion_length": 216.5, + "epoch": 2.7797202797202796, + "grad_norm": 0.7178113460540771, + "kl": 0.3248441517353058, + "learning_rate": 3.768845907401761e-06, + "loss": 0.013, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 795 + }, + { + "completion_length": 425.66668701171875, + "epoch": 2.7832167832167833, + "grad_norm": 0.4357425570487976, + "kl": 0.24865001440048218, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0099, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 796 + }, + { + "completion_length": 827.1666870117188, + "epoch": 2.7867132867132867, + "grad_norm": 0.36945709586143494, + "kl": 0.26294025778770447, + "learning_rate": 3.7613190595375484e-06, + "loss": 0.0105, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 797 + }, + { + "completion_length": 193.33334350585938, + "epoch": 2.79020979020979, + "grad_norm": 1.0428582429885864, + "kl": 0.3159600496292114, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0126, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 798 + }, + { + "completion_length": 573.1666870117188, + "epoch": 2.7937062937062938, + "grad_norm": 0.7567842602729797, + "kl": 0.37068232893943787, + "learning_rate": 3.753776842898644e-06, + "loss": 0.0148, + "reward": 2.704166889190674, + "reward_std": 0.8732721209526062, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8708332777023315, + "step": 799 + }, + { + "completion_length": 500.66668701171875, + "epoch": 2.797202797202797, + "grad_norm": 0.5451098680496216, + "kl": 0.24930475652217865, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.01, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 800 + }, + { + "completion_length": 534.8333740234375, + "epoch": 2.800699300699301, + "grad_norm": 0.6100650429725647, + "kl": 0.2307787835597992, + "learning_rate": 3.7462193493845763e-06, + "loss": 0.0092, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 801 + }, + { + "completion_length": 494.3333435058594, + "epoch": 2.804195804195804, + "grad_norm": 0.3402723968029022, + "kl": 0.18503499031066895, + "learning_rate": 3.742434902568889e-06, + "loss": 0.0074, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 802 + }, + { + "completion_length": 173.5, + "epoch": 2.8076923076923075, + "grad_norm": 1.8058785200119019, + "kl": 0.3168944716453552, + "learning_rate": 3.738646671081019e-06, + "loss": 0.0127, + "reward": 2.016666889190674, + "reward_std": 0.9521905183792114, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 803 + }, + { + "completion_length": 512.8333740234375, + "epoch": 2.8111888111888113, + "grad_norm": 0.526727020740509, + "kl": 0.23828241229057312, + "learning_rate": 3.7348546664605777e-06, + "loss": 0.0095, + "reward": 2.625, + "reward_std": 1.3467553853988647, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 804 + }, + { + "completion_length": 527.5, + "epoch": 2.8146853146853146, + "grad_norm": 0.41340726613998413, + "kl": 0.18879906833171844, + "learning_rate": 3.7310589002586683e-06, + "loss": 0.0076, + "reward": 1.9583333730697632, + "reward_std": 1.0772264003753662, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 805 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.8181818181818183, + "grad_norm": 0.5648691058158875, + "kl": 0.315143346786499, + "learning_rate": 3.7272593840378526e-06, + "loss": 0.0126, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 806 + }, + { + "completion_length": 478.0, + "epoch": 2.8216783216783217, + "grad_norm": 0.7256986498832703, + "kl": 0.36903661489486694, + "learning_rate": 3.723456129372116e-06, + "loss": 0.0148, + "reward": 2.691666603088379, + "reward_std": 1.66505765914917, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 807 + }, + { + "completion_length": 502.3333435058594, + "epoch": 2.825174825174825, + "grad_norm": 0.44219493865966797, + "kl": 0.20148871839046478, + "learning_rate": 3.7196491478468322e-06, + "loss": 0.0081, + "reward": 3.125, + "reward_std": 1.5823242664337158, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 808 + }, + { + "completion_length": 486.5, + "epoch": 2.8286713286713288, + "grad_norm": 0.47201624512672424, + "kl": 0.21870753169059753, + "learning_rate": 3.7158384510587264e-06, + "loss": 0.0087, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 809 + }, + { + "completion_length": 537.3333740234375, + "epoch": 2.832167832167832, + "grad_norm": 0.6436510682106018, + "kl": 0.22534185647964478, + "learning_rate": 3.7120240506158433e-06, + "loss": 0.009, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 810 + }, + { + "completion_length": 237.5, + "epoch": 2.835664335664336, + "grad_norm": 0.646536648273468, + "kl": 0.27237847447395325, + "learning_rate": 3.708205958137506e-06, + "loss": 0.0109, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 811 + }, + { + "completion_length": 255.6666717529297, + "epoch": 2.839160839160839, + "grad_norm": 0.6682825684547424, + "kl": 0.3421524465084076, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.0137, + "reward": 3.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 812 + }, + { + "completion_length": 210.33334350585938, + "epoch": 2.8426573426573425, + "grad_norm": 0.9165672063827515, + "kl": 0.307815283536911, + "learning_rate": 3.7005587436079724e-06, + "loss": 0.0123, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 813 + }, + { + "completion_length": 830.8333740234375, + "epoch": 2.8461538461538463, + "grad_norm": 0.4606754779815674, + "kl": 0.20725002884864807, + "learning_rate": 3.6967296448515176e-06, + "loss": 0.0083, + "reward": 1.633333444595337, + "reward_std": 1.3418892621994019, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 814 + }, + { + "completion_length": 165.6666717529297, + "epoch": 2.8496503496503496, + "grad_norm": 8.907544136047363, + "kl": 0.7807542085647583, + "learning_rate": 3.6928969006490212e-06, + "loss": 0.0312, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 815 + }, + { + "completion_length": 852.3333740234375, + "epoch": 2.8531468531468533, + "grad_norm": 0.5166431665420532, + "kl": 0.22571682929992676, + "learning_rate": 3.689060522675689e-06, + "loss": 0.009, + "reward": 2.191666603088379, + "reward_std": 1.2499668598175049, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 816 + }, + { + "completion_length": 223.6666717529297, + "epoch": 2.8566433566433567, + "grad_norm": 0.7508683204650879, + "kl": 0.2404782623052597, + "learning_rate": 3.6852205226177907e-06, + "loss": 0.0096, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 817 + }, + { + "completion_length": 197.33334350585938, + "epoch": 2.86013986013986, + "grad_norm": 0.9129036664962769, + "kl": 0.3411031663417816, + "learning_rate": 3.6813769121726356e-06, + "loss": 0.0136, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 818 + }, + { + "completion_length": 516.5, + "epoch": 2.8636363636363638, + "grad_norm": 0.462089866399765, + "kl": 0.2940046191215515, + "learning_rate": 3.677529703048525e-06, + "loss": 0.0118, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 819 + }, + { + "completion_length": 200.33334350585938, + "epoch": 2.867132867132867, + "grad_norm": 0.8957485556602478, + "kl": 0.2655426859855652, + "learning_rate": 3.6736789069647273e-06, + "loss": 0.0106, + "reward": 2.370833396911621, + "reward_std": 0.4925486445426941, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 820 + }, + { + "completion_length": 198.5, + "epoch": 2.870629370629371, + "grad_norm": 0.8342744708061218, + "kl": 0.24618063867092133, + "learning_rate": 3.6698245356514337e-06, + "loss": 0.0098, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 821 + }, + { + "completion_length": 153.33334350585938, + "epoch": 2.874125874125874, + "grad_norm": 0.7607580423355103, + "kl": 0.27888309955596924, + "learning_rate": 3.6659666008497287e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 822 + }, + { + "completion_length": 206.1666717529297, + "epoch": 2.8776223776223775, + "grad_norm": 0.8246819972991943, + "kl": 0.2606455087661743, + "learning_rate": 3.66210511431155e-06, + "loss": 0.0104, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 823 + }, + { + "completion_length": 509.5, + "epoch": 2.8811188811188813, + "grad_norm": 0.4599984586238861, + "kl": 0.2845722734928131, + "learning_rate": 3.658240087799655e-06, + "loss": 0.0114, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 824 + }, + { + "completion_length": 577.5, + "epoch": 2.8846153846153846, + "grad_norm": 0.3288043439388275, + "kl": 0.2456766664981842, + "learning_rate": 3.654371533087586e-06, + "loss": 0.0098, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 825 + }, + { + "completion_length": 516.3333740234375, + "epoch": 2.8881118881118883, + "grad_norm": 0.4140225946903229, + "kl": 0.2799639403820038, + "learning_rate": 3.6504994619596295e-06, + "loss": 0.0112, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 826 + }, + { + "completion_length": 180.0, + "epoch": 2.8916083916083917, + "grad_norm": 4.02695894241333, + "kl": 0.5962376594543457, + "learning_rate": 3.6466238862107884e-06, + "loss": 0.0238, + "reward": 2.625, + "reward_std": 1.1973929405212402, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 827 + }, + { + "completion_length": 220.5, + "epoch": 2.895104895104895, + "grad_norm": 0.7351843118667603, + "kl": 0.2822580337524414, + "learning_rate": 3.642744817646736e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 828 + }, + { + "completion_length": 515.0, + "epoch": 2.8986013986013988, + "grad_norm": 0.08226211369037628, + "kl": 0.25046059489250183, + "learning_rate": 3.6388622680837893e-06, + "loss": 0.0124, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 829 + }, + { + "completion_length": 505.16668701171875, + "epoch": 2.902097902097902, + "grad_norm": 0.4543350040912628, + "kl": 0.24183645844459534, + "learning_rate": 3.634976249348867e-06, + "loss": 0.0097, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 830 + }, + { + "completion_length": 218.6666717529297, + "epoch": 2.905594405594406, + "grad_norm": 0.7515471577644348, + "kl": 0.23829472064971924, + "learning_rate": 3.631086773279457e-06, + "loss": 0.0095, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 831 + }, + { + "completion_length": 213.6666717529297, + "epoch": 2.909090909090909, + "grad_norm": 0.7354035973548889, + "kl": 0.261251837015152, + "learning_rate": 3.627193851723577e-06, + "loss": 0.0105, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 832 + }, + { + "completion_length": 212.6666717529297, + "epoch": 2.9125874125874125, + "grad_norm": 0.057771261781454086, + "kl": 0.27106887102127075, + "learning_rate": 3.6232974965397416e-06, + "loss": 0.0132, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 833 + }, + { + "completion_length": 504.5, + "epoch": 2.916083916083916, + "grad_norm": 0.5733168125152588, + "kl": 0.2574426233768463, + "learning_rate": 3.6193977195969243e-06, + "loss": 0.0103, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 834 + }, + { + "completion_length": 201.33334350585938, + "epoch": 2.9195804195804196, + "grad_norm": 0.04076343774795532, + "kl": 0.3926897644996643, + "learning_rate": 3.6154945327745223e-06, + "loss": 0.0181, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 835 + }, + { + "completion_length": 504.66668701171875, + "epoch": 2.9230769230769234, + "grad_norm": 0.41645586490631104, + "kl": 0.28634482622146606, + "learning_rate": 3.611587947962319e-06, + "loss": 0.0115, + "reward": 2.8500001430511475, + "reward_std": 0.8366600275039673, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 836 + }, + { + "completion_length": 193.1666717529297, + "epoch": 2.9265734265734267, + "grad_norm": 0.584320068359375, + "kl": 0.36589449644088745, + "learning_rate": 3.6076779770604496e-06, + "loss": 0.0146, + "reward": 2.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 837 + }, + { + "completion_length": 151.1666717529297, + "epoch": 2.93006993006993, + "grad_norm": 20.536643981933594, + "kl": 2.479689121246338, + "learning_rate": 3.6037646319793635e-06, + "loss": 0.0992, + "reward": 2.2166666984558105, + "reward_std": 0.8577102422714233, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7166666984558105, + "step": 838 + }, + { + "completion_length": 191.5, + "epoch": 2.9335664335664333, + "grad_norm": 16.66517448425293, + "kl": 1.5780773162841797, + "learning_rate": 3.599847924639788e-06, + "loss": 0.0631, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 839 + }, + { + "completion_length": 391.16668701171875, + "epoch": 2.937062937062937, + "grad_norm": 0.408719927072525, + "kl": 0.27218514680862427, + "learning_rate": 3.595927866972694e-06, + "loss": 0.0109, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 840 + }, + { + "completion_length": 214.83334350585938, + "epoch": 2.9405594405594404, + "grad_norm": 0.8976386189460754, + "kl": 0.3075045049190521, + "learning_rate": 3.592004470919256e-06, + "loss": 0.0123, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 841 + }, + { + "completion_length": 191.5, + "epoch": 2.944055944055944, + "grad_norm": 0.8545355796813965, + "kl": 0.3112524747848511, + "learning_rate": 3.5880777484308193e-06, + "loss": 0.0125, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 842 + }, + { + "completion_length": 169.0, + "epoch": 2.9475524475524475, + "grad_norm": 3.7284159660339355, + "kl": 0.7016023397445679, + "learning_rate": 3.5841477114688616e-06, + "loss": 0.0281, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.7916666865348816, + "step": 843 + }, + { + "completion_length": 125.16667175292969, + "epoch": 2.951048951048951, + "grad_norm": 0.08267883211374283, + "kl": 0.3959384560585022, + "learning_rate": 3.5802143720049565e-06, + "loss": 0.0182, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 844 + }, + { + "completion_length": 191.5, + "epoch": 2.9545454545454546, + "grad_norm": 0.6881157159805298, + "kl": 0.28095734119415283, + "learning_rate": 3.5762777420207382e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 845 + }, + { + "completion_length": 165.0, + "epoch": 2.958041958041958, + "grad_norm": 17.351783752441406, + "kl": 3.690758466720581, + "learning_rate": 3.5723378335078653e-06, + "loss": 0.1476, + "reward": 2.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 846 + }, + { + "completion_length": 498.8333435058594, + "epoch": 2.9615384615384617, + "grad_norm": 0.6604005098342896, + "kl": 0.22229203581809998, + "learning_rate": 3.5683946584679818e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 1.2355835437774658, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 847 + }, + { + "completion_length": 831.5, + "epoch": 2.965034965034965, + "grad_norm": 0.4960877597332001, + "kl": 0.2161356508731842, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0086, + "reward": 1.691666603088379, + "reward_std": 1.2619099617004395, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 848 + }, + { + "completion_length": 206.0, + "epoch": 2.9685314685314683, + "grad_norm": 0.7653523683547974, + "kl": 0.2660280764102936, + "learning_rate": 3.5604985568634754e-06, + "loss": 0.0106, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 849 + }, + { + "completion_length": 188.6666717529297, + "epoch": 2.972027972027972, + "grad_norm": 5.382836818695068, + "kl": 0.5285301804542542, + "learning_rate": 3.556545654351749e-06, + "loss": 0.0211, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 850 + }, + { + "completion_length": 240.0, + "epoch": 2.9755244755244754, + "grad_norm": 0.7501710653305054, + "kl": 0.2436714470386505, + "learning_rate": 3.552589533418728e-06, + "loss": 0.0097, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 851 + }, + { + "completion_length": 243.6666717529297, + "epoch": 2.979020979020979, + "grad_norm": 0.6721853017807007, + "kl": 0.2798357903957367, + "learning_rate": 3.5486302061154433e-06, + "loss": 0.0112, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 852 + }, + { + "completion_length": 444.5, + "epoch": 2.9825174825174825, + "grad_norm": 0.47981539368629456, + "kl": 0.2967255413532257, + "learning_rate": 3.5446676845026922e-06, + "loss": 0.0119, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 853 + }, + { + "completion_length": 395.66668701171875, + "epoch": 2.986013986013986, + "grad_norm": 0.5626189708709717, + "kl": 0.27203381061553955, + "learning_rate": 3.5407019806510035e-06, + "loss": 0.0109, + "reward": 2.450000286102295, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 854 + }, + { + "completion_length": 170.33334350585938, + "epoch": 2.9895104895104896, + "grad_norm": 0.6966221928596497, + "kl": 0.28407615423202515, + "learning_rate": 3.536733106640598e-06, + "loss": 0.0114, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 855 + }, + { + "completion_length": 238.0, + "epoch": 2.993006993006993, + "grad_norm": 0.7480601668357849, + "kl": 0.2773779332637787, + "learning_rate": 3.532761074561355e-06, + "loss": 0.0111, + "reward": 3.450000286102295, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 856 + }, + { + "completion_length": 202.5, + "epoch": 2.9965034965034967, + "grad_norm": 0.7340975999832153, + "kl": 0.3754419982433319, + "learning_rate": 3.5287858965127726e-06, + "loss": 0.015, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 857 + }, + { + "completion_length": 197.5, + "epoch": 3.0, + "grad_norm": 0.7914144992828369, + "kl": 0.2655482590198517, + "learning_rate": 3.524807584603932e-06, + "loss": 0.0106, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 858 + }, + { + "completion_length": 200.83334350585938, + "epoch": 3.0034965034965033, + "grad_norm": 0.7124848365783691, + "kl": 0.2514963448047638, + "learning_rate": 3.5208261509534627e-06, + "loss": 0.0101, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 859 + }, + { + "completion_length": 238.83334350585938, + "epoch": 3.006993006993007, + "grad_norm": 0.061087291687726974, + "kl": 0.3129764497280121, + "learning_rate": 3.516841607689501e-06, + "loss": 0.0149, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 860 + }, + { + "completion_length": 237.5, + "epoch": 3.0104895104895104, + "grad_norm": 0.7671120762825012, + "kl": 0.2705675959587097, + "learning_rate": 3.512853966949657e-06, + "loss": 0.0108, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 861 + }, + { + "completion_length": 220.0, + "epoch": 3.013986013986014, + "grad_norm": 0.7371062636375427, + "kl": 0.2749347686767578, + "learning_rate": 3.5088632408809757e-06, + "loss": 0.011, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 862 + }, + { + "completion_length": 291.3333435058594, + "epoch": 3.0174825174825175, + "grad_norm": 0.43244338035583496, + "kl": 0.2551594078540802, + "learning_rate": 3.504869441639901e-06, + "loss": 0.0102, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 863 + }, + { + "completion_length": 478.5, + "epoch": 3.020979020979021, + "grad_norm": 0.6255612373352051, + "kl": 0.21007221937179565, + "learning_rate": 3.5008725813922383e-06, + "loss": 0.0084, + "reward": 2.7333333492279053, + "reward_std": 0.8256310820579529, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 864 + }, + { + "completion_length": 524.3333740234375, + "epoch": 3.0244755244755246, + "grad_norm": 3.062211751937866, + "kl": 0.5263761878013611, + "learning_rate": 3.496872672313116e-06, + "loss": 0.0211, + "reward": 2.4583334922790527, + "reward_std": 1.2043325901031494, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 865 + }, + { + "completion_length": 234.33334350585938, + "epoch": 3.027972027972028, + "grad_norm": 0.7551546692848206, + "kl": 0.28137749433517456, + "learning_rate": 3.4928697265869516e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 866 + }, + { + "completion_length": 784.1666870117188, + "epoch": 3.0314685314685317, + "grad_norm": 1202.5098876953125, + "kl": 133.5108184814453, + "learning_rate": 3.488863756407413e-06, + "loss": 5.3404, + "reward": 1.7000001668930054, + "reward_std": 1.4064139127731323, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.5, + "rewards/reward_retry": 0.5333333611488342, + "step": 867 + }, + { + "completion_length": 495.5, + "epoch": 3.034965034965035, + "grad_norm": 0.5171676278114319, + "kl": 0.2767552137374878, + "learning_rate": 3.4848547739773782e-06, + "loss": 0.0111, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 868 + }, + { + "completion_length": 237.33334350585938, + "epoch": 3.0384615384615383, + "grad_norm": 0.6683189272880554, + "kl": 0.26167619228363037, + "learning_rate": 3.480842791508904e-06, + "loss": 0.0105, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 869 + }, + { + "completion_length": 234.33334350585938, + "epoch": 3.041958041958042, + "grad_norm": 0.7994163036346436, + "kl": 0.24035631120204926, + "learning_rate": 3.476827821223184e-06, + "loss": 0.0096, + "reward": 2.7833335399627686, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 870 + }, + { + "completion_length": 249.83334350585938, + "epoch": 3.0454545454545454, + "grad_norm": 0.06245582178235054, + "kl": 0.35248756408691406, + "learning_rate": 3.4728098753505157e-06, + "loss": 0.0165, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 871 + }, + { + "completion_length": 214.33334350585938, + "epoch": 3.0489510489510487, + "grad_norm": 0.7337875366210938, + "kl": 0.2762923240661621, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.0111, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 872 + }, + { + "completion_length": 178.6666717529297, + "epoch": 3.0524475524475525, + "grad_norm": 5.453631401062012, + "kl": 0.6660811305046082, + "learning_rate": 3.4647651058107967e-06, + "loss": 0.0266, + "reward": 2.9583334922790527, + "reward_std": 1.5298421382904053, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 873 + }, + { + "completion_length": 207.0, + "epoch": 3.055944055944056, + "grad_norm": 0.7989277839660645, + "kl": 0.2972930669784546, + "learning_rate": 3.460738306649509e-06, + "loss": 0.0119, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 874 + }, + { + "completion_length": 547.6666870117188, + "epoch": 3.0594405594405596, + "grad_norm": 0.3988294005393982, + "kl": 0.19996020197868347, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.008, + "reward": 1.7916667461395264, + "reward_std": 0.9645811915397644, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 875 + }, + { + "completion_length": 227.0, + "epoch": 3.062937062937063, + "grad_norm": 0.049581676721572876, + "kl": 0.3112030327320099, + "learning_rate": 3.452675940875686e-06, + "loss": 0.0148, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 876 + }, + { + "completion_length": 218.83334350585938, + "epoch": 3.0664335664335662, + "grad_norm": 0.10846934467554092, + "kl": 0.300067275762558, + "learning_rate": 3.448640398822513e-06, + "loss": 0.0144, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 877 + }, + { + "completion_length": 808.8333740234375, + "epoch": 3.06993006993007, + "grad_norm": 0.5524038672447205, + "kl": 0.21805749833583832, + "learning_rate": 3.4446019670461684e-06, + "loss": 0.0087, + "reward": 2.516666889190674, + "reward_std": 1.2355835437774658, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8500000238418579, + "step": 878 + }, + { + "completion_length": 227.6666717529297, + "epoch": 3.0734265734265733, + "grad_norm": 0.8796508312225342, + "kl": 0.3852163255214691, + "learning_rate": 3.440560657848414e-06, + "loss": 0.0154, + "reward": 2.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 879 + }, + { + "completion_length": 493.8333435058594, + "epoch": 3.076923076923077, + "grad_norm": 0.7607131004333496, + "kl": 0.37157270312309265, + "learning_rate": 3.436516483539781e-06, + "loss": 0.0149, + "reward": 2.5250000953674316, + "reward_std": 1.6839685440063477, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916667222976685, + "step": 880 + }, + { + "completion_length": 322.3333435058594, + "epoch": 3.0804195804195804, + "grad_norm": 0.8827399611473083, + "kl": 0.23807722330093384, + "learning_rate": 3.4324694564395228e-06, + "loss": 0.0095, + "reward": 3.258333206176758, + "reward_std": 0.7939878702163696, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.9250000715255737, + "step": 881 + }, + { + "completion_length": 508.66668701171875, + "epoch": 3.0839160839160837, + "grad_norm": 0.619054913520813, + "kl": 0.2534877061843872, + "learning_rate": 3.4284195888755877e-06, + "loss": 0.0101, + "reward": 2.3500001430511475, + "reward_std": 1.2328827381134033, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 882 + }, + { + "completion_length": 525.0, + "epoch": 3.0874125874125875, + "grad_norm": 0.340080589056015, + "kl": 0.33849403262138367, + "learning_rate": 3.4243668931845734e-06, + "loss": 0.0135, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 883 + }, + { + "completion_length": 199.33334350585938, + "epoch": 3.090909090909091, + "grad_norm": 0.8693441152572632, + "kl": 0.29566070437431335, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.0118, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 884 + }, + { + "completion_length": 528.6666870117188, + "epoch": 3.0944055944055946, + "grad_norm": 0.4151526689529419, + "kl": 0.22466908395290375, + "learning_rate": 3.4162530668107435e-06, + "loss": 0.009, + "reward": 3.125, + "reward_std": 1.5823243856430054, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 885 + }, + { + "completion_length": 166.6666717529297, + "epoch": 3.097902097902098, + "grad_norm": 0.7520537972450256, + "kl": 0.2834951877593994, + "learning_rate": 3.412191960844049e-06, + "loss": 0.0113, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 886 + }, + { + "completion_length": 1012.3333740234375, + "epoch": 3.1013986013986012, + "grad_norm": 0.3472742438316345, + "kl": 0.23108121752738953, + "learning_rate": 3.4081280761824465e-06, + "loss": 0.0092, + "reward": 3.183333396911621, + "reward_std": 1.4445298910140991, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 887 + }, + { + "completion_length": 542.0, + "epoch": 3.104895104895105, + "grad_norm": 0.3802301287651062, + "kl": 0.27087870240211487, + "learning_rate": 3.4040614252052305e-06, + "loss": 0.0108, + "reward": 3.375, + "reward_std": 1.001873254776001, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8749999403953552, + "step": 888 + }, + { + "completion_length": 239.5, + "epoch": 3.1083916083916083, + "grad_norm": 0.7231019735336304, + "kl": 0.27221542596817017, + "learning_rate": 3.3999920203001287e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 889 + }, + { + "completion_length": 129.83334350585938, + "epoch": 3.111888111888112, + "grad_norm": 0.6994293332099915, + "kl": 0.5249607563018799, + "learning_rate": 3.39591987386325e-06, + "loss": 0.0234, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 890 + }, + { + "completion_length": 1352.3333740234375, + "epoch": 3.1153846153846154, + "grad_norm": 0.4665977954864502, + "kl": 0.17533139884471893, + "learning_rate": 3.391844998299063e-06, + "loss": 0.007, + "reward": 2.612499952316284, + "reward_std": 0.989412784576416, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.7791666984558105, + "step": 891 + }, + { + "completion_length": 515.8333740234375, + "epoch": 3.1188811188811187, + "grad_norm": 0.7257252335548401, + "kl": 0.2537638545036316, + "learning_rate": 3.387767406020343e-06, + "loss": 0.0102, + "reward": 2.625, + "reward_std": 1.4878677129745483, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 892 + }, + { + "completion_length": 520.1666870117188, + "epoch": 3.1223776223776225, + "grad_norm": 0.46513354778289795, + "kl": 0.2608945369720459, + "learning_rate": 3.383687109448143e-06, + "loss": 0.0104, + "reward": 3.3500001430511475, + "reward_std": 1.058300495147705, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 893 + }, + { + "completion_length": 244.0, + "epoch": 3.125874125874126, + "grad_norm": 0.08787418156862259, + "kl": 0.3290454149246216, + "learning_rate": 3.3796041210117545e-06, + "loss": 0.0155, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 894 + }, + { + "completion_length": 255.1666717529297, + "epoch": 3.129370629370629, + "grad_norm": 0.693707287311554, + "kl": 0.2893093526363373, + "learning_rate": 3.375518453148669e-06, + "loss": 0.0116, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 895 + }, + { + "completion_length": 339.0, + "epoch": 3.132867132867133, + "grad_norm": 0.6376519203186035, + "kl": 0.25471729040145874, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.0102, + "reward": 2.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 896 + }, + { + "completion_length": 199.1666717529297, + "epoch": 3.1363636363636362, + "grad_norm": 0.8792543411254883, + "kl": 0.2878587245941162, + "learning_rate": 3.3673391289331398e-06, + "loss": 0.0115, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 897 + }, + { + "completion_length": 231.6666717529297, + "epoch": 3.13986013986014, + "grad_norm": 0.865860641002655, + "kl": 0.2975502610206604, + "learning_rate": 3.3632454974963368e-06, + "loss": 0.0119, + "reward": 2.7833335399627686, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 898 + }, + { + "completion_length": 202.33334350585938, + "epoch": 3.1433566433566433, + "grad_norm": 0.7683760523796082, + "kl": 0.2757861614227295, + "learning_rate": 3.359149236464041e-06, + "loss": 0.011, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 899 + }, + { + "completion_length": 532.8333740234375, + "epoch": 3.1468531468531467, + "grad_norm": 0.4694902002811432, + "kl": 0.2863316535949707, + "learning_rate": 3.3550503583141726e-06, + "loss": 0.0115, + "reward": 3.183333396911621, + "reward_std": 1.4445298910140991, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 900 + }, + { + "completion_length": 532.6666870117188, + "epoch": 3.1503496503496504, + "grad_norm": 0.6590800881385803, + "kl": 0.2290704846382141, + "learning_rate": 3.3509488755326257e-06, + "loss": 0.0092, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 901 + }, + { + "completion_length": 546.5, + "epoch": 3.1538461538461537, + "grad_norm": 1.005067229270935, + "kl": 0.28186488151550293, + "learning_rate": 3.346844800613229e-06, + "loss": 0.0113, + "reward": 3.3500001430511475, + "reward_std": 0.6928204298019409, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.8499999046325684, + "step": 902 + }, + { + "completion_length": 517.8333740234375, + "epoch": 3.1573426573426575, + "grad_norm": 0.5787184238433838, + "kl": 0.2819029986858368, + "learning_rate": 3.3427381460577057e-06, + "loss": 0.0113, + "reward": 3.2916667461395264, + "reward_std": 1.6125807762145996, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 903 + }, + { + "completion_length": 505.0, + "epoch": 3.160839160839161, + "grad_norm": 0.3865443170070648, + "kl": 0.32136544585227966, + "learning_rate": 3.338628924375638e-06, + "loss": 0.0129, + "reward": 3.516666889190674, + "reward_std": 1.0614454746246338, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 904 + }, + { + "completion_length": 851.3333740234375, + "epoch": 3.164335664335664, + "grad_norm": 0.4539722800254822, + "kl": 0.15193934738636017, + "learning_rate": 3.3345171480844275e-06, + "loss": 0.0061, + "reward": 1.8000000715255737, + "reward_std": 1.5792405605316162, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 905 + }, + { + "completion_length": 269.66668701171875, + "epoch": 3.167832167832168, + "grad_norm": 0.6799562573432922, + "kl": 0.2678685784339905, + "learning_rate": 3.3304028297092583e-06, + "loss": 0.0107, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 906 + }, + { + "completion_length": 196.1666717529297, + "epoch": 3.1713286713286712, + "grad_norm": 1.3065040111541748, + "kl": 0.3553697466850281, + "learning_rate": 3.326285981783058e-06, + "loss": 0.0142, + "reward": 1.899999976158142, + "reward_std": 0.7314369678497314, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8999999761581421, + "step": 907 + }, + { + "completion_length": 500.0, + "epoch": 3.174825174825175, + "grad_norm": 0.42356637120246887, + "kl": 0.2747868299484253, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.011, + "reward": 2.8500001430511475, + "reward_std": 1.3190906047821045, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 908 + }, + { + "completion_length": 143.33334350585938, + "epoch": 3.1783216783216783, + "grad_norm": 0.8865023255348206, + "kl": 0.2782132625579834, + "learning_rate": 3.31804474744776e-06, + "loss": 0.0111, + "reward": 2.950000286102295, + "reward_std": 0.6324555277824402, + "rewards/reward_correctness": 0.8333333730697632, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 909 + }, + { + "completion_length": 224.83334350585938, + "epoch": 3.1818181818181817, + "grad_norm": 0.8996831178665161, + "kl": 0.2446550577878952, + "learning_rate": 3.313920386142892e-06, + "loss": 0.0098, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 910 + }, + { + "completion_length": 198.5, + "epoch": 3.1853146853146854, + "grad_norm": 0.8285199999809265, + "kl": 0.2812727093696594, + "learning_rate": 3.309793545495374e-06, + "loss": 0.0113, + "reward": 2.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 911 + }, + { + "completion_length": 169.1666717529297, + "epoch": 3.1888111888111887, + "grad_norm": 2.005018711090088, + "kl": 0.5320585370063782, + "learning_rate": 3.3056642380762783e-06, + "loss": 0.0213, + "reward": 3.183333396911621, + "reward_std": 1.0230672359466553, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 912 + }, + { + "completion_length": 207.6666717529297, + "epoch": 3.1923076923076925, + "grad_norm": 0.7691407203674316, + "kl": 0.313401997089386, + "learning_rate": 3.301532476464191e-06, + "loss": 0.0125, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 913 + }, + { + "completion_length": 185.83334350585938, + "epoch": 3.195804195804196, + "grad_norm": 0.7594463229179382, + "kl": 0.3427696228027344, + "learning_rate": 3.2973982732451753e-06, + "loss": 0.0137, + "reward": 3.2833335399627686, + "reward_std": 0.8164966106414795, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.949999988079071, + "step": 914 + }, + { + "completion_length": 497.66668701171875, + "epoch": 3.199300699300699, + "grad_norm": 0.4282509386539459, + "kl": 0.24064713716506958, + "learning_rate": 3.293261641012731e-06, + "loss": 0.0096, + "reward": 2.2916667461395264, + "reward_std": 1.1918123960494995, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 915 + }, + { + "completion_length": 217.6666717529297, + "epoch": 3.202797202797203, + "grad_norm": 0.09016474336385727, + "kl": 0.36173754930496216, + "learning_rate": 3.2891225923677565e-06, + "loss": 0.0169, + "reward": 3.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 916 + }, + { + "completion_length": 215.5, + "epoch": 3.2062937062937062, + "grad_norm": 0.06597334891557693, + "kl": 0.2595183253288269, + "learning_rate": 3.2849811399185125e-06, + "loss": 0.0128, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 917 + }, + { + "completion_length": 574.3333740234375, + "epoch": 3.20979020979021, + "grad_norm": 0.9240683913230896, + "kl": 0.32425248622894287, + "learning_rate": 3.280837296280582e-06, + "loss": 0.013, + "reward": 3.204166889190674, + "reward_std": 0.6063036322593689, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 918 + }, + { + "completion_length": 227.33334350585938, + "epoch": 3.2132867132867133, + "grad_norm": 0.0464554987847805, + "kl": 0.24978820979595184, + "learning_rate": 3.27669107407683e-06, + "loss": 0.0124, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 919 + }, + { + "completion_length": 217.83334350585938, + "epoch": 3.2167832167832167, + "grad_norm": 0.09970775246620178, + "kl": 0.28330814838409424, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0137, + "reward": 2.950000286102295, + "reward_std": 0.0, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 920 + }, + { + "completion_length": 352.16668701171875, + "epoch": 3.2202797202797204, + "grad_norm": 0.7447912096977234, + "kl": 0.2716596722602844, + "learning_rate": 3.2683915444995184e-06, + "loss": 0.0109, + "reward": 2.950000286102295, + "reward_std": 1.095445156097412, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 921 + }, + { + "completion_length": 208.1666717529297, + "epoch": 3.2237762237762237, + "grad_norm": 0.8169512748718262, + "kl": 0.28194016218185425, + "learning_rate": 3.2642382624077647e-06, + "loss": 0.0113, + "reward": 2.7833335399627686, + "reward_std": 0.7527726888656616, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 922 + }, + { + "completion_length": 166.5, + "epoch": 3.227272727272727, + "grad_norm": 4.4463629722595215, + "kl": 0.6169787645339966, + "learning_rate": 3.260082652313726e-06, + "loss": 0.0247, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 923 + }, + { + "completion_length": 212.1666717529297, + "epoch": 3.230769230769231, + "grad_norm": 1.077783226966858, + "kl": 0.4446868896484375, + "learning_rate": 3.2559247268761117e-06, + "loss": 0.0178, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 924 + }, + { + "completion_length": 214.0, + "epoch": 3.234265734265734, + "grad_norm": 0.8344194293022156, + "kl": 0.2506507933139801, + "learning_rate": 3.2517644987606827e-06, + "loss": 0.01, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 925 + }, + { + "completion_length": 197.5, + "epoch": 3.237762237762238, + "grad_norm": 0.878742516040802, + "kl": 0.2665516138076782, + "learning_rate": 3.247601980640217e-06, + "loss": 0.0107, + "reward": 3.116666793823242, + "reward_std": 0.40824827551841736, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 926 + }, + { + "completion_length": 155.6666717529297, + "epoch": 3.2412587412587412, + "grad_norm": 0.8167181015014648, + "kl": 0.29363638162612915, + "learning_rate": 3.243437185194465e-06, + "loss": 0.0117, + "reward": 3.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 927 + }, + { + "completion_length": 195.83334350585938, + "epoch": 3.2447552447552446, + "grad_norm": 0.9464877247810364, + "kl": 0.3257704973220825, + "learning_rate": 3.2392701251101172e-06, + "loss": 0.013, + "reward": 2.950000286102295, + "reward_std": 0.8944271802902222, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 928 + }, + { + "completion_length": 187.0, + "epoch": 3.2482517482517483, + "grad_norm": 0.5975750088691711, + "kl": 0.2691609263420105, + "learning_rate": 3.23510081308076e-06, + "loss": 0.0108, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 929 + }, + { + "completion_length": 752.6666870117188, + "epoch": 3.2517482517482517, + "grad_norm": 0.6095904111862183, + "kl": 0.24479100108146667, + "learning_rate": 3.230929261806842e-06, + "loss": 0.0098, + "reward": 2.516666889190674, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8500000238418579, + "step": 930 + }, + { + "completion_length": 195.1666717529297, + "epoch": 3.2552447552447554, + "grad_norm": 0.9495894312858582, + "kl": 0.23146504163742065, + "learning_rate": 3.2267554839956315e-06, + "loss": 0.0093, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 931 + }, + { + "completion_length": 200.1666717529297, + "epoch": 3.2587412587412588, + "grad_norm": 0.9735977053642273, + "kl": 0.28021007776260376, + "learning_rate": 3.222579492361179e-06, + "loss": 0.0112, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 932 + }, + { + "completion_length": 212.33334350585938, + "epoch": 3.262237762237762, + "grad_norm": 0.7763716578483582, + "kl": 0.34078341722488403, + "learning_rate": 3.2184012996242808e-06, + "loss": 0.0136, + "reward": 3.616666793823242, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 933 + }, + { + "completion_length": 486.8333435058594, + "epoch": 3.265734265734266, + "grad_norm": 1.7987946271896362, + "kl": 0.4850091338157654, + "learning_rate": 3.214220918512434e-06, + "loss": 0.0194, + "reward": 1.9666666984558105, + "reward_std": 1.5233734846115112, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6333333253860474, + "step": 934 + }, + { + "completion_length": 709.3333740234375, + "epoch": 3.269230769230769, + "grad_norm": 2.0046944618225098, + "kl": 0.5092130899429321, + "learning_rate": 3.2100383617598075e-06, + "loss": 0.0204, + "reward": 3.758333206176758, + "reward_std": 0.4694856107234955, + "rewards/reward_correctness": 1.0, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.9249999523162842, + "step": 935 + }, + { + "completion_length": 222.1666717529297, + "epoch": 3.2727272727272725, + "grad_norm": 0.7624396085739136, + "kl": 0.24156790971755981, + "learning_rate": 3.205853642107192e-06, + "loss": 0.0097, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 936 + }, + { + "completion_length": 794.8333740234375, + "epoch": 3.2762237762237763, + "grad_norm": 0.5818172693252563, + "kl": 0.2191859483718872, + "learning_rate": 3.20166677230197e-06, + "loss": 0.0088, + "reward": 2.3583333492279053, + "reward_std": 0.6785401105880737, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8583332896232605, + "step": 937 + }, + { + "completion_length": 203.5, + "epoch": 3.2797202797202796, + "grad_norm": 0.8157092928886414, + "kl": 0.26415112614631653, + "learning_rate": 3.1974777650980737e-06, + "loss": 0.0106, + "reward": 2.2833335399627686, + "reward_std": 0.5163978338241577, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.3333333432674408, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 938 + }, + { + "completion_length": 516.6666870117188, + "epoch": 3.2832167832167833, + "grad_norm": 0.5414477586746216, + "kl": 0.22202841937541962, + "learning_rate": 3.1932866332559455e-06, + "loss": 0.0089, + "reward": 2.516666889190674, + "reward_std": 0.6976150274276733, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 939 + }, + { + "completion_length": 685.6666870117188, + "epoch": 3.2867132867132867, + "grad_norm": 0.3968733549118042, + "kl": 0.26331794261932373, + "learning_rate": 3.189093389542498e-06, + "loss": 0.0105, + "reward": 3.0458333492279053, + "reward_std": 1.544216513633728, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7125000357627869, + "step": 940 + }, + { + "completion_length": 193.33334350585938, + "epoch": 3.29020979020979, + "grad_norm": 0.7643024325370789, + "kl": 0.3172113001346588, + "learning_rate": 3.184898046731082e-06, + "loss": 0.0127, + "reward": 3.450000286102295, + "reward_std": 0.547722578048706, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 1.0, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 941 + }, + { + "completion_length": 509.5, + "epoch": 3.2937062937062938, + "grad_norm": 0.5346531867980957, + "kl": 0.2539750933647156, + "learning_rate": 3.180700617601436e-06, + "loss": 0.0102, + "reward": 2.625, + "reward_std": 1.616709589958191, + "rewards/reward_correctness": 0.5, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 942 + }, + { + "completion_length": 201.6666717529297, + "epoch": 3.297202797202797, + "grad_norm": 0.7389536499977112, + "kl": 0.2968377470970154, + "learning_rate": 3.176501114939659e-06, + "loss": 0.0119, + "reward": 3.2833335399627686, + "reward_std": 0.8164965510368347, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 943 + }, + { + "completion_length": 506.5, + "epoch": 3.300699300699301, + "grad_norm": 0.4880388081073761, + "kl": 0.24028021097183228, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.0096, + "reward": 1.8500001430511475, + "reward_std": 0.8366600871086121, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.1666666716337204, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 944 + }, + { + "completion_length": 505.16668701171875, + "epoch": 3.304195804195804, + "grad_norm": 0.5714858174324036, + "kl": 0.2349107563495636, + "learning_rate": 3.1680959401956425e-06, + "loss": 0.0094, + "reward": 2.8500001430511475, + "reward_std": 1.0488088130950928, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.8499999046325684, + "step": 945 + }, + { + "completion_length": 505.5, + "epoch": 3.3076923076923075, + "grad_norm": 0.4575049579143524, + "kl": 0.2562292814254761, + "learning_rate": 3.1638902937170224e-06, + "loss": 0.0102, + "reward": 2.125, + "reward_std": 1.150543451309204, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.5, + "rewards/reward_format": 0.8333333730697632, + "rewards/reward_retry": 0.7916666865348816, + "step": 946 + }, + { + "completion_length": 193.5, + "epoch": 3.3111888111888113, + "grad_norm": 0.7549261450767517, + "kl": 0.2406913936138153, + "learning_rate": 3.1596826249134328e-06, + "loss": 0.0096, + "reward": 3.116666793823242, + "reward_std": 0.40824830532073975, + "rewards/reward_correctness": 0.3333333432674408, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.949999988079071, + "step": 947 + }, + { + "completion_length": 244.0, + "epoch": 3.3146853146853146, + "grad_norm": 0.8239571452140808, + "kl": 0.20550505816936493, + "learning_rate": 3.155472946602162e-06, + "loss": 0.0082, + "reward": 2.5375001430511475, + "reward_std": 0.49085378646850586, + "rewards/reward_correctness": 0.0, + "rewards/reward_em_chunk": 0.6666666865348816, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708332777023315, + "step": 948 + }, + { + "completion_length": 822.1666870117188, + "epoch": 3.3181818181818183, + "grad_norm": 0.7868720293045044, + "kl": 0.20647165179252625, + "learning_rate": 3.1512612716066217e-06, + "loss": 0.0083, + "reward": 2.0250000953674316, + "reward_std": 1.4372718334197998, + "rewards/reward_correctness": 0.6666666865348816, + "rewards/reward_em_chunk": 0.0, + "rewards/reward_format": 0.6666666865348816, + "rewards/reward_retry": 0.6916666030883789, + "step": 949 + }, + { + "completion_length": 586.0, + "epoch": 3.3216783216783217, + "grad_norm": 0.6502403616905212, + "kl": 0.2520906627178192, + "learning_rate": 3.147047612756302e-06, + "loss": 0.0101, + "reward": 2.870833396911621, + "reward_std": 0.661516547203064, + "rewards/reward_correctness": 0.1666666716337204, + "rewards/reward_em_chunk": 0.8333333730697632, + "rewards/reward_format": 1.0, + "rewards/reward_retry": 0.8708333373069763, + "step": 950 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-950/training_args.bin b/checkpoint-950/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..404a67ca1097568ef818195412e92eb5df6df003 --- /dev/null +++ b/checkpoint-950/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b809202c83316443ca7c3596f9666d891e249e918f031374256726d85b5070 +size 6008 diff --git a/logs/app.log b/logs/app.log index 951307f3724784e4f94f340a921da3bc4e0f14ad..a7d8e6d41a749c184e84375ffe60ba0e197c7fcf 100644 --- a/logs/app.log +++ b/logs/app.log @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf687cbd08df63a7f1283521b08f5eb0ecf3b9390fde77aa5a8ce1c3fd02b01b -size 78927447 +oid sha256:6e7d668ea4ddad2c20c8e4ed87049a1e94172759f83e79f07f125e76ddf7e88b +size 160519920 diff --git a/runs/Apr03_08-51-27_c6a7d9a1991e/events.out.tfevents.1743670289.c6a7d9a1991e.8025.0 b/runs/Apr03_08-51-27_c6a7d9a1991e/events.out.tfevents.1743670289.c6a7d9a1991e.8025.0 index 629dfd8d6e75326cff4ca6e7b41504eb1870b87f..72f1f281a2ea8230aa984d4e6ef11c295933a76e 100644 --- a/runs/Apr03_08-51-27_c6a7d9a1991e/events.out.tfevents.1743670289.c6a7d9a1991e.8025.0 +++ b/runs/Apr03_08-51-27_c6a7d9a1991e/events.out.tfevents.1743670289.c6a7d9a1991e.8025.0 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4717f2b797d418d889320c418ed3b167f5ab6a930dcb881e3fa7e9e98ef728b6 -size 410730 +oid sha256:14d38009dba6121c25eb07eb0d7a52b41c666680edefcacd4009fbb88de58247 +size 690522